rlgym-learn-algos 0.1.5-cp310-cp310-win32.whl → 0.2.0-cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rlgym_learn_algos/logging/metrics_logger.py +1 -1
- rlgym_learn_algos/logging/wandb_metrics_logger.py +27 -22
- rlgym_learn_algos/ppo/experience_buffer.py +71 -42
- rlgym_learn_algos/ppo/experience_buffer_numpy.py +14 -12
- rlgym_learn_algos/ppo/gae_trajectory_processor.py +2 -11
- rlgym_learn_algos/ppo/gae_trajectory_processor_pure_python.py +0 -8
- rlgym_learn_algos/ppo/ppo_agent_controller.py +16 -27
- rlgym_learn_algos/ppo/ppo_learner.py +94 -34
- rlgym_learn_algos/ppo/trajectory_processor.py +3 -3
- rlgym_learn_algos/rlgym_learn_algos.cp310-win32.pyd +0 -0
- rlgym_learn_algos/util/torch_pydantic.py +118 -0
- {rlgym_learn_algos-0.1.5.dist-info → rlgym_learn_algos-0.2.0.dist-info}/METADATA +1 -1
- {rlgym_learn_algos-0.1.5.dist-info → rlgym_learn_algos-0.2.0.dist-info}/RECORD +20 -19
- {rlgym_learn_algos-0.1.5.dist-info → rlgym_learn_algos-0.2.0.dist-info}/WHEEL +1 -1
- {rlgym_learn_algos-0.1.5.dist-info → rlgym_learn_algos-0.2.0.dist-info}/licenses/LICENSE +0 -0
rlgym_learn_algos/logging/metrics_logger.py

@@ -12,9 +12,9 @@ MetricsLoggerAdditionalDerivedConfig = TypeVar("MetricsLoggerAdditionalDerivedCo
 class DerivedMetricsLoggerConfig(
     Generic[MetricsLoggerConfig, MetricsLoggerAdditionalDerivedConfig]
 ):
+    metrics_logger_config: MetricsLoggerConfig = None
     checkpoint_load_folder: Optional[str] = None
     agent_controller_name: str = ""
-    metrics_logger_config: MetricsLoggerConfig = None
     additional_derived_config: MetricsLoggerAdditionalDerivedConfig = None

rlgym_learn_algos/logging/wandb_metrics_logger.py

@@ -29,7 +29,7 @@ def convert_nested_dict(d):
     return new


-class WandbMetricsLoggerConfigModel(BaseModel):
+class WandbMetricsLoggerConfigModel(BaseModel, extra="forbid"):
     enable: bool = True
     project: str = "rlgym-learn"
     group: str = "unnamed-runs"

@@ -37,6 +37,7 @@ class WandbMetricsLoggerConfigModel(BaseModel):
     id: Optional[str] = None
     new_run_with_timestamp_suffix: bool = False
     additional_wandb_run_config: Dict[str, Any] = Field(default_factory=dict)
+    settings_kwargs: Dict[str, Any] = Field(default_factory=dict)


 @dataclass

@@ -76,6 +77,7 @@ class WandbMetricsLogger(
     ):
         self.inner_metrics_logger = inner_metrics_logger
         self.checkpoint_file_name = checkpoint_file_name
+        self.run_id = None

     def collect_env_metrics(self, data: List[Dict[str, Any]]):
         self.inner_metrics_logger.collect_env_metrics(data)

@@ -107,17 +109,11 @@ class WandbMetricsLogger(
             self.run_id = None
             return

-        if
-
-
-
-            if self.run_id is not None:
-                print(
-                    f"{self.config.agent_controller_name}: Wandb run id from checkpoint ({self.run_id}) is being overridden by wandb run id from config: {self.config.metrics_logger_config.id}"
-                )
+        if self.run_id is not None and self.config.metrics_logger_config.id is not None:
+            print(
+                f"{self.config.agent_controller_name}: Wandb run id from checkpoint ({self.run_id}) is being overridden by wandb run id from config: {self.config.metrics_logger_config.id}"
+            )
             self.run_id = self.config.metrics_logger_config.id
-        else:
-            self.run_id = None

         wandb_config = {
             **self.config.additional_derived_config.derived_wandb_run_config,

@@ -145,22 +141,31 @@ class WandbMetricsLogger(
             id=self.run_id,
             resume="allow",
             reinit=True,
+            settings=wandb.Settings(
+                **self.config.metrics_logger_config.settings_kwargs
+            ),
         )
         self.run_id = self.wandb_run.id
         print(f"{self.config.agent_controller_name}: Created wandb run! {self.run_id}")

     def _load_from_checkpoint(self):
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            with open(
+                os.path.join(
+                    self.config.checkpoint_load_folder,
+                    self.checkpoint_file_name,
+                ),
+                "rt",
+            ) as f:
+                state = json.load(f)
+            if "run_id" in state:
+                self.run_id = state["run_id"]
+            else:
+                self.run_id = None
+        except FileNotFoundError:
+            print(
+                f"{self.config.agent_controller_name}: Tried to load from checkpoint, but checkpoint didn't contain a wandb run! A new run will be created based on the config values."
+            )
             self.run_id = None

     def save_checkpoint(self, folder_path):
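The new settings_kwargs field is forwarded verbatim into wandb.Settings when the run is created. A minimal sketch of a config fragment that could use it; the init_timeout key is only an illustrative wandb setting, not something this package defines:

    metrics_logger_config = {
        "project": "rlgym-learn",
        "group": "unnamed-runs",
        # unpacked into wandb.Settings(**settings_kwargs) by WandbMetricsLogger
        "settings_kwargs": {"init_timeout": 120},
    }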
rlgym_learn_algos/ppo/experience_buffer.py

@@ -8,6 +8,9 @@ import torch
 from pydantic import BaseModel, Field, model_validator
 from rlgym.api import ActionType, AgentID, ObsType, RewardType

+from rlgym_learn_algos.util.torch_functions import get_device
+from rlgym_learn_algos.util.torch_pydantic import PydanticTorchDevice
+
 from .trajectory import Trajectory
 from .trajectory_processor import (
     DerivedTrajectoryProcessorConfig,

@@ -19,8 +22,9 @@ from .trajectory_processor import (
 EXPERIENCE_BUFFER_FILE = "experience_buffer.pkl"


-class ExperienceBufferConfigModel(BaseModel):
+class ExperienceBufferConfigModel(BaseModel, extra="forbid"):
     max_size: int = 100000
+    device: PydanticTorchDevice = "auto"
     trajectory_processor_config: Dict[str, Any] = Field(default_factory=dict)

     @model_validator(mode="before")

@@ -31,21 +35,35 @@ class ExperienceBufferConfigModel(BaseModel):
             data.trajectory_processor_config = (
                 data.trajectory_processor_config.model_dump()
             )
-        elif isinstance(data, dict)
-            if
-            data["trajectory_processor_config"]
-                "trajectory_processor_config"
-
+        elif isinstance(data, dict):
+            if "trajectory_processor_config" in data:
+                if isinstance(data["trajectory_processor_config"], BaseModel):
+                    data["trajectory_processor_config"] = data[
+                        "trajectory_processor_config"
+                    ].model_dump()
+            if "device" not in data or data["device"] == "auto":
+                data["device"] = get_device("auto")
         return data

+    # device: PydanticTorchDevice = "auto"
+
+    # @model_validator(mode="before")
+    # @classmethod
+    # def set_device(cls, data):
+    #     if isinstance(data, dict) and (
+    #         "device" not in data or data["device"] == "auto"
+    #     ):
+    #         data["device"] = get_device("auto")
+    #     return data
+

 @dataclass
 class DerivedExperienceBufferConfig:
-
+    experience_buffer_config: ExperienceBufferConfigModel
+    agent_controller_name: str
     seed: int
-    dtype:
-
-    trajectory_processor_config: Dict[str, Any]
+    dtype: torch.dtype
+    learner_device: torch.device
     checkpoint_load_folder: Optional[str] = None

@@ -111,42 +129,50 @@ class ExperienceBuffer(
         self.agent_ids: List[AgentID] = []
         self.observations: List[ObsType] = []
         self.actions: List[ActionType] = []
-        self.log_probs = torch.FloatTensor()
-        self.values = torch.FloatTensor()
-        self.advantages = torch.FloatTensor()

     def load(self, config: DerivedExperienceBufferConfig):
         self.config = config
         self.rng = np.random.RandomState(config.seed)
         trajectory_processor_config = self.trajectory_processor.validate_config(
-            config.trajectory_processor_config
+            config.experience_buffer_config.trajectory_processor_config
         )
         self.trajectory_processor.load(
             DerivedTrajectoryProcessorConfig(
                 trajectory_processor_config=trajectory_processor_config,
                 dtype=config.dtype,
-                device=config.
+                device=config.learner_device,
             )
         )
+        self.log_probs = torch.tensor([], dtype=config.dtype)
+        self.values = torch.tensor([], dtype=config.dtype)
+        self.advantages = torch.tensor([], dtype=config.dtype)
         if self.config.checkpoint_load_folder is not None:
             self._load_from_checkpoint()
-        self.log_probs = self.log_probs.to(config.
-        self.values = self.values.to(config.
-        self.advantages = self.advantages.to(config.
+        self.log_probs = self.log_probs.to(config.learner_device)
+        self.values = self.values.to(config.learner_device)
+        self.advantages = self.advantages.to(config.learner_device)

     def _load_from_checkpoint(self):
         # lazy way
-
-
-
-
-
-
-
-
-
-
-
+        # TODO: don't use pickle for torch things, use torch.load because of map_location. Or maybe define a custom unpickler for this? Or maybe one already exists?
+        try:
+            with open(
+                os.path.join(
+                    self.config.checkpoint_load_folder, EXPERIENCE_BUFFER_FILE
+                ),
+                "rb",
+            ) as f:
+                state_dict = pickle.load(f)
+            self.agent_ids = state_dict["agent_ids"]
+            self.observations = state_dict["observations"]
+            self.actions = state_dict["actions"]
+            self.log_probs = state_dict["log_probs"]
+            self.values = state_dict["values"]
+            self.advantages = state_dict["advantages"]
+        except FileNotFoundError:
+            print(
+                f"{self.config.agent_controller_name}: Tried to load from checkpoint, but checkpoint didn't contain a saved experience buffer! A blank experience buffer will be used instead."
+            )

     def save_checkpoint(self, folder_path):
         os.makedirs(folder_path, exist_ok=True)

@@ -195,29 +221,36 @@ class ExperienceBuffer(
             exp_buffer_data
         )

-        self.agent_ids = _cat_list(
+        self.agent_ids = _cat_list(
+            self.agent_ids, agent_ids, self.config.experience_buffer_config.max_size
+        )
         self.observations = _cat_list(
-            self.observations,
+            self.observations,
+            observations,
+            self.config.experience_buffer_config.max_size,
+        )
+        self.actions = _cat_list(
+            self.actions, actions, self.config.experience_buffer_config.max_size
         )
-        self.actions = _cat_list(self.actions, actions, self.config.max_size)
         self.log_probs = _cat(
             self.log_probs,
             log_probs,
-            self.config.max_size,
+            self.config.experience_buffer_config.max_size,
         )
         self.values = _cat(
             self.values,
             values,
-            self.config.max_size,
+            self.config.experience_buffer_config.max_size,
         )
         self.advantages = _cat(
             self.advantages,
             advantages,
-            self.config.max_size,
+            self.config.experience_buffer_config.max_size,
        )

         return trajectory_processor_data

+    # TODO: tensordict?
     def _get_samples(self, indices) -> Tuple[
         Iterable[AgentID],
         Iterable[ObsType],

@@ -242,18 +275,14 @@ class ExperienceBuffer(
         :param batch_size: size of each batch yielded by the generator.
         :return:
         """
-        if self.config.
+        if self.config.learner_device.type != "cpu":
             torch.cuda.current_stream().synchronize()
         total_samples = self.values.shape[0]
         indices = self.rng.permutation(total_samples)
         start_idx = 0
-        batches = []
         while start_idx + batch_size <= total_samples:
-
-                self._get_samples(indices[start_idx : start_idx + batch_size])
-            )
+            yield self._get_samples(indices[start_idx : start_idx + batch_size])
             start_idx += batch_size
-        return batches

     def clear(self):
         """
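get_all_batches_shuffled now yields batches lazily instead of building and returning a list, so callers iterate over the call directly. A self-contained stand-in that mirrors the new control flow (names here are illustrative, not part of the package):

    import numpy as np

    def batches_shuffled(total_samples, batch_size, rng):
        # Mirrors the generator above: trailing samples that do not fill a
        # whole batch are dropped because of the <= bound.
        indices = rng.permutation(total_samples)
        start_idx = 0
        while start_idx + batch_size <= total_samples:
            yield indices[start_idx : start_idx + batch_size]
            start_idx += batch_size

    for batch_indices in batches_shuffled(10, 4, np.random.RandomState(0)):
        print(batch_indices)  # two batches of 4; the last 2 samples are skipped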
@@ -265,4 +294,4 @@ class ExperienceBuffer(
         del self.log_probs
         del self.values
         del self.advantages
-        self.__init__(self.
+        self.__init__(self.trajectory_processor)
rlgym_learn_algos/ppo/experience_buffer_numpy.py

@@ -76,25 +76,31 @@ class NumpyExperienceBuffer(
             exp_buffer_data
         )

-        self.agent_ids = _cat_list(
+        self.agent_ids = _cat_list(
+            self.agent_ids, agent_ids, self.config.experience_buffer_config.max_size
+        )
         self.observations = _cat_numpy(
-            self.observations,
+            self.observations,
+            observations,
+            self.config.experience_buffer_config.max_size,
+        )
+        self.actions = _cat_numpy(
+            self.actions, actions, self.config.experience_buffer_config.max_size
         )
-        self.actions = _cat_numpy(self.actions, actions, self.config.max_size)
         self.log_probs = _cat(
             self.log_probs,
             log_probs,
-            self.config.max_size,
+            self.config.experience_buffer_config.max_size,
         )
         self.values = _cat(
             self.values,
             values,
-            self.config.max_size,
+            self.config.experience_buffer_config.max_size,
         )
         self.advantages = _cat(
             self.advantages,
             advantages,
-            self.config.max_size,
+            self.config.experience_buffer_config.max_size,
         )

         return trajectory_processor_data

@@ -116,18 +122,14 @@ class NumpyExperienceBuffer(
         :param batch_size: size of each batch yielded by the generator.
         :return:
         """
-        if self.config.device != "cpu":
+        if self.config.experience_buffer_config.device.type != "cpu":
             torch.cuda.current_stream().synchronize()
         total_samples = self.values.shape[0]
         indices = self.rng.permutation(total_samples)
         start_idx = 0
-        batches = []
         while start_idx + batch_size <= total_samples:
-
-                self._get_samples(indices[start_idx : start_idx + batch_size])
-            )
+            yield self._get_samples(indices[start_idx : start_idx + batch_size])
             start_idx += batch_size
-        return batches

     def clear(self):
         """
rlgym_learn_algos/ppo/gae_trajectory_processor.py

@@ -20,7 +20,7 @@ from ..ppo import RustDerivedGAETrajectoryProcessorConfig, RustGAETrajectoryProc
 from .trajectory_processor import TRAJECTORY_PROCESSOR_FILE, TrajectoryProcessor


-class GAETrajectoryProcessorConfigModel(BaseModel):
+class GAETrajectoryProcessorConfigModel(BaseModel, extra="forbid"):
     gamma: float = 0.99
     lmbda: float = 0.95
     standardize_returns: bool = True

@@ -122,7 +122,7 @@ class GAETrajectoryProcessor(
             self._load_from_checkpoint()
         self.rust_gae_trajectory_processor.load(
             RustDerivedGAETrajectoryProcessorConfig(
-                self.gamma, self.lmbda, np.dtype(self.dtype)
+                self.gamma, self.lmbda, np.dtype(str(self.dtype)[6:])
             )
         )

@@ -132,19 +132,10 @@ class GAETrajectoryProcessor(
             "rt",
         ) as f:
             state = json.load(f)
-        # TODO: why are these 4 getting saved/loaded?? They should just come from config
-        self.gamma = state["gamma"]
-        self.lmbda = state["lambda"]
-        self.standardize_returns = state["standardize_returns"]
-        self.max_returns_per_stats_increment = state["max_returns_per_stats_increment"]
         self.return_stats.load_state_dict(state["return_running_stats"])

     def save_checkpoint(self, folder_path):
         state = {
-            "gamma": self.gamma,
-            "lambda": self.lmbda,
-            "standardize_returns": self.standardize_returns,
-            "max_returns_per_stats_increment": self.max_returns_per_stats_increment,
             "return_running_stats": self.return_stats.state_dict(),
         }
         with open(
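The np.dtype(str(self.dtype)[6:]) change above converts a torch dtype to a numpy dtype by name: str(torch.float32) is "torch.float32", and slicing off the six-character "torch." prefix leaves a name numpy accepts. A quick runnable check:

    import numpy as np
    import torch

    name = str(torch.float32)[6:]
    print(name)            # float32
    print(np.dtype(name))  # float32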
rlgym_learn_algos/ppo/gae_trajectory_processor_pure_python.py

@@ -161,18 +161,10 @@ class GAETrajectoryProcessorPurePython(
             "rt",
         ) as f:
             state = json.load(f)
-        self.gamma = state["gamma"]
-        self.lmbda = state["lambda"]
-        self.standardize_returns = state["standardize_returns"]
-        self.max_returns_per_stats_increment = state["max_returns_per_stats_increment"]
         self.return_stats.load_state_dict(state["return_running_stats"])

     def save_checkpoint(self, folder_path):
         state = {
-            "gamma": self.gamma,
-            "lambda": self.lmbda,
-            "standardize_returns": self.standardize_returns,
-            "max_returns_per_stats_increment": self.max_returns_per_stats_increment,
             "return_running_stats": self.return_stats.state_dict(),
         }
         with open(
rlgym_learn_algos/ppo/ppo_agent_controller.py

@@ -24,8 +24,6 @@ from rlgym.api import (
 )
 from rlgym_learn import EnvActionResponse, EnvActionResponseType, Timestep
 from rlgym_learn.api.agent_controller import AgentController
-from torch import device as _device
-
 from rlgym_learn_algos.logging import (
     DerivedMetricsLoggerConfig,
     MetricsLogger,

@@ -36,6 +34,7 @@ from rlgym_learn_algos.logging import (
 )
 from rlgym_learn_algos.stateful_functions import ObsStandardizer
 from rlgym_learn_algos.util.torch_functions import get_device
+from torch import device as _device

 from .actor import Actor
 from .critic import Critic

@@ -62,15 +61,13 @@ ITERATION_SHARED_INFOS_FILE = "iteration_shared_infos.pkl"
 CURRENT_TRAJECTORIES_FILE = "current_trajectories.pkl"


-class PPOAgentControllerConfigModel(BaseModel):
+class PPOAgentControllerConfigModel(BaseModel, extra="forbid"):
     timesteps_per_iteration: int = 50000
     save_every_ts: int = 1_000_000
     add_unix_timestamp: bool = True
     checkpoint_load_folder: Optional[str] = None
     n_checkpoints_to_keep: int = 5
     random_seed: int = 123
-    dtype: str = "float32"
-    device: Optional[str] = None
     learner_config: PPOLearnerConfigModel = Field(default_factory=PPOLearnerConfigModel)
     experience_buffer_config: ExperienceBufferConfigModel = Field(
         default_factory=ExperienceBufferConfigModel

@@ -190,11 +187,9 @@ class PPOAgentController(

     def load(self, config):
         self.config = config
-
-
-
-        self.device = get_device(device)
-        print(f"{self.config.agent_controller_name}: Using device {self.device}")
+        print(
+            f"{self.config.agent_controller_name}: Using device {config.agent_controller_config.learner_config.device}"
+        )
         agent_controller_config = config.agent_controller_config
         learner_config = config.agent_controller_config.learner_config
         experience_buffer_config = (

@@ -234,14 +229,14 @@ class PPOAgentController(
             # TODO: this doesn't seem to be working
             if abs_save_folder == loaded_checkpoint_runs_folder:
                 print(
-                    "Using the loaded checkpoint's run folder as the checkpoints save folder."
+                    f"{config.agent_controller_name}: Using the loaded checkpoint's run folder as the checkpoints save folder."
                 )
                 checkpoints_save_folder = os.path.abspath(
                     os.path.join(agent_controller_config.checkpoint_load_folder, "..")
                 )
             else:
                 print(
-                    "Runs folder in config does not align with loaded checkpoint's runs folder. Creating new run in the config-based runs folder."
+                    f"{config.agent_controller_name}: Runs folder in config does not align with loaded checkpoint's runs folder. Creating new run in the config-based runs folder."
                 )
                 checkpoints_save_folder = os.path.join(
                     config.save_folder, agent_controller_config.run_name + run_suffix

@@ -257,26 +252,19 @@ class PPOAgentController(

         self.learner.load(
             DerivedPPOLearnerConfig(
+                learner_config=learner_config,
                 obs_space=self.obs_space,
                 action_space=self.action_space,
-                n_epochs=learner_config.n_epochs,
-                batch_size=learner_config.batch_size,
-                n_minibatches=learner_config.n_minibatches,
-                ent_coef=learner_config.ent_coef,
-                clip_range=learner_config.clip_range,
-                actor_lr=learner_config.actor_lr,
-                critic_lr=learner_config.critic_lr,
-                device=self.device,
                 checkpoint_load_folder=learner_checkpoint_load_folder,
             )
         )
         self.experience_buffer.load(
             DerivedExperienceBufferConfig(
-
-
-
-
-
+                experience_buffer_config=experience_buffer_config,
+                agent_controller_name=config.agent_controller_name,
+                seed=config.base_config.random_seed,
+                dtype=agent_controller_config.learner_config.dtype,
+                learner_device=agent_controller_config.learner_config.device,
                 checkpoint_load_folder=experience_buffer_checkpoint_load_folder,
             )
         )

@@ -301,9 +289,9 @@ class PPOAgentController(
             additional_derived_config = None
         self.metrics_logger.load(
             DerivedMetricsLoggerConfig(
+                metrics_logger_config=metrics_logger_config,
                 checkpoint_load_folder=metrics_logger_checkpoint_load_folder,
                 agent_controller_name=config.agent_controller_name,
-                metrics_logger_config=metrics_logger_config,
                 additional_derived_config=additional_derived_config,
             )
         )

@@ -465,6 +453,7 @@ class PPOAgentController(
         ):
             self.timestep_collection_end_time = time.perf_counter()
             self._learn()
+            self.cur_iteration += 1
             if self.ts_since_last_save >= self.config.agent_controller_config.save_every_ts:
                 self.save_checkpoint()
                 self.ts_since_last_save = 0

@@ -563,5 +552,5 @@ class PPOAgentController(
         for idx, (start, stop) in enumerate(traj_timestep_idx_ranges):
             self.current_trajectories[idx].val_preds = val_preds[start : stop - 1]
             self.current_trajectories[idx].final_val_pred = val_preds[stop - 1]
-        if self.device != "cpu":
+        if self.config.agent_controller_config.learner_config.device.type != "cpu":
             torch.cuda.current_stream().synchronize()
rlgym_learn_algos/ppo/ppo_learner.py

@@ -7,7 +7,7 @@ from typing import Generic, Optional

 import numpy as np
 import torch
-from pydantic import BaseModel
+from pydantic import BaseModel, field_serializer, model_validator
 from rlgym.api import (
     ActionSpaceType,
     ActionType,

@@ -16,6 +16,11 @@ from rlgym.api import (
     ObsType,
     RewardType,
 )
+from rlgym_learn_algos.util.torch_functions import get_device
+from rlgym_learn_algos.util.torch_pydantic import (
+    PydanticTorchDevice,
+    PydanticTorchDtype,
+)
 from torch import nn as nn

 from .actor import Actor

@@ -24,7 +29,8 @@ from .experience_buffer import ExperienceBuffer
 from .trajectory_processor import TrajectoryProcessorConfig, TrajectoryProcessorData


-class PPOLearnerConfigModel(BaseModel):
+class PPOLearnerConfigModel(BaseModel, extra="forbid"):
+    dtype: PydanticTorchDtype = torch.float32
     n_epochs: int = 1
     batch_size: int = 50000
     n_minibatches: int = 1

@@ -32,20 +38,45 @@ class PPOLearnerConfigModel(BaseModel):
     clip_range: float = 0.2
     actor_lr: float = 3e-4
     critic_lr: float = 3e-4
+    device: PydanticTorchDevice = "auto"
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_device(cls, data):
+        if isinstance(data, dict) and (
+            "device" not in data or data["device"] == "auto"
+        ):
+            data["device"] = get_device("auto")
+        return data
+
+
+    # @model_validator(mode="before")
+    # @classmethod
+    # def set_agent_controllers_config(cls, data):
+    #     if isinstance(data, LearningCoordinatorConfigModel):
+    #         agent_controllers_config = {}
+    #         for k, v in data.agent_controllers_config.items():
+    #             if isinstance(v, BaseModel):
+    #                 agent_controllers_config[k] = v.model_dump()
+    #             else:
+    #                 agent_controllers_config[k] = v
+    #         data.agent_controllers_config = agent_controllers_config
+    #     elif isinstance(data, dict) and "agent_controllers_config" in data:
+    #         agent_controllers_config = {}
+    #         for k, v in data["agent_controllers_config"].items():
+    #             if isinstance(v, BaseModel):
+    #                 agent_controllers_config[k] = v.model_dump()
+    #             else:
+    #                 agent_controllers_config[k] = v
+    #         data["agent_controllers_config"] = agent_controllers_config
+    #     return data


 @dataclass
 class DerivedPPOLearnerConfig:
+    learner_config: PPOLearnerConfigModel
     obs_space: ObsSpaceType
     action_space: ActionSpaceType
-    n_epochs: int = 10
-    batch_size: int = 50000
-    n_minibatches: int = 1
-    ent_coef: float = 0.005
-    clip_range: float = 0.2
-    actor_lr: float = 3e-4
-    critic_lr: float = 3e-4
-    device: str = "auto"
     checkpoint_load_folder: Optional[str] = None
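With dtype and device now fields of PPOLearnerConfigModel (and device also on ExperienceBufferConfigModel), the nested agent controller config gains those keys under learner_config and experience_buffer_config. A sketch of what such a config could look like; the values are illustrative defaults, and "auto" is resolved to a concrete device by the set_device validator:

    ppo_agent_controller_config = {
        "timesteps_per_iteration": 50_000,
        "learner_config": {
            "dtype": "float32",
            "batch_size": 50_000,
            "n_minibatches": 1,
            "device": "auto",  # replaced via get_device("auto") before validation
        },
        "experience_buffer_config": {
            "max_size": 100_000,
            "device": "auto",
        },
    }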
|
@@ -97,15 +128,17 @@ class PPOLearner(
         self.config = config

         self.actor = self.actor_factory(
-            config.obs_space, config.action_space, config.device
+            config.obs_space, config.action_space, config.learner_config.device
+        )
+        self.critic = self.critic_factory(
+            config.obs_space, config.learner_config.device
         )
-        self.critic = self.critic_factory(config.obs_space, config.device)

         self.actor_optimizer = torch.optim.Adam(
-            self.actor.parameters(), lr=self.config.actor_lr
+            self.actor.parameters(), lr=self.config.learner_config.actor_lr
         )
         self.critic_optimizer = torch.optim.Adam(
-            self.critic.parameters(), lr=self.config.critic_lr
+            self.critic.parameters(), lr=self.config.learner_config.critic_lr
         )
         self.critic_loss_fn = torch.nn.MSELoss()

@@ -130,14 +163,17 @@ class PPOLearner(
         print("-" * 20)
         print(f"{'Total':<10} {total_parameters:<10}")

-        print(f"Current Policy Learning Rate: {self.config.actor_lr}")
-        print(f"Current Critic Learning Rate: {self.config.critic_lr}")
+        print(f"Current Policy Learning Rate: {self.config.learner_config.actor_lr}")
+        print(f"Current Critic Learning Rate: {self.config.learner_config.critic_lr}")
         self.cumulative_model_updates = 0

         if self.config.checkpoint_load_folder is not None:
             self._load_from_checkpoint()
         self.minibatch_size = int(
-            np.ceil(
+            np.ceil(
+                self.config.learner_config.batch_size
+                / self.config.learner_config.n_minibatches
+            )
         )

     def _load_from_checkpoint(self):

@@ -147,19 +183,27 @@ class PPOLearner(
         ), f"PPO Learner cannot find folder: {self.config.checkpoint_load_folder}"

         self.actor.load_state_dict(
-            torch.load(
+            torch.load(
+                os.path.join(self.config.checkpoint_load_folder, ACTOR_FILE),
+                map_location=self.config.learner_config.device,
+            )
         )
         self.critic.load_state_dict(
-            torch.load(
+            torch.load(
+                os.path.join(self.config.checkpoint_load_folder, CRITIC_FILE),
+                map_location=self.config.learner_config.device,
+            )
         )
         self.actor_optimizer.load_state_dict(
             torch.load(
-                os.path.join(self.config.checkpoint_load_folder, ACTOR_OPTIMIZER_FILE)
+                os.path.join(self.config.checkpoint_load_folder, ACTOR_OPTIMIZER_FILE),
+                map_location=self.config.learner_config.device,
             )
         )
         self.critic_optimizer.load_state_dict(
             torch.load(
-                os.path.join(self.config.checkpoint_load_folder, CRITIC_OPTIMIZER_FILE)
+                os.path.join(self.config.checkpoint_load_folder, CRITIC_OPTIMIZER_FILE),
+                map_location=self.config.learner_config.device,
             )
         )
         with open(

@@ -215,9 +259,11 @@ class PPOLearner(
         critic_before = torch.nn.utils.parameters_to_vector(self.critic.parameters())

         t1 = time.time()
-        for epoch in range(self.config.n_epochs):
+        for epoch in range(self.config.learner_config.n_epochs):
             # Get all shuffled batches from the experience buffer.
-            batches = exp.get_all_batches_shuffled(
+            batches = exp.get_all_batches_shuffled(
+                self.config.learner_config.batch_size
+            )
             for batch in batches:
                 (
                     batch_agent_ids,

@@ -232,20 +278,29 @@ class PPOLearner(
                 self.critic_optimizer.zero_grad()

                 for minibatch_slice in range(
-                    0, self.config.batch_size, self.minibatch_size
+                    0, self.config.learner_config.batch_size, self.minibatch_size
                 ):
                     # Send everything to the device and enforce correct shapes.
                     start = minibatch_slice
-                    stop = min(
-
+                    stop = min(
+                        start + self.minibatch_size,
+                        self.config.learner_config.batch_size,
+                    )
+                    minibatch_ratio = (
+                        stop - start
+                    ) / self.config.learner_config.batch_size

                     agent_ids = batch_agent_ids[start:stop]
                     obs = batch_obs[start:stop]
                     acts = batch_acts[start:stop]
-                    advantages = batch_advantages[start:stop].to(
-
+                    advantages = batch_advantages[start:stop].to(
+                        self.config.learner_config.device
+                    )
+                    old_probs = batch_old_probs[start:stop].to(
+                        self.config.learner_config.device
+                    )
                     target_values = batch_target_values[start:stop].to(
-                        self.config.device
+                        self.config.learner_config.device
                     )

                     # Compute value estimates.
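The new minibatch_ratio weights each minibatch's contribution by its share of the batch, which matters when batch_size is not divisible by n_minibatches and the last slice is shorter. A small worked check of the arithmetic:

    import numpy as np

    batch_size = 50_000
    n_minibatches = 3
    minibatch_size = int(np.ceil(batch_size / n_minibatches))  # 16667

    ratios = []
    for start in range(0, batch_size, minibatch_size):
        stop = min(start + minibatch_size, batch_size)
        ratios.append((stop - start) / batch_size)

    print(ratios)  # [0.33334, 0.33334, 0.33332] -- the weights sum to 1.0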
@@ -262,8 +317,8 @@ class PPOLearner(
                     ratio = torch.exp(log_probs - old_probs)
                     clipped = torch.clamp(
                         ratio,
-                        1.0 - self.config.clip_range,
-                        1.0 + self.config.clip_range,
+                        1.0 - self.config.learner_config.clip_range,
+                        1.0 + self.config.learner_config.clip_range,
                     )

                     # Compute KL divergence & clip fraction using SB3 method for reporting.

@@ -274,7 +329,10 @@ class PPOLearner(

                     # From the stable-baselines3 implementation of PPO.
                     clip_fraction = torch.mean(
-                        (
+                        (
+                            torch.abs(ratio - 1)
+                            > self.config.learner_config.clip_range
+                        ).float()
                     ).to(device="cpu", non_blocking=True)
                     clip_fractions.append((clip_fraction, minibatch_ratio))

@@ -285,7 +343,9 @@ class PPOLearner(
                     value_loss = (
                         self.critic_loss_fn(vals, target_values) * minibatch_ratio
                     )
-                    ppo_loss =
+                    ppo_loss = (
+                        actor_loss - entropy * self.config.learner_config.ent_coef
+                    )

                     ppo_loss.backward()
                     value_loss.backward()

@@ -312,7 +372,7 @@ class PPOLearner(
         actor_update_magnitude = (actor_before - actor_after).norm().cpu().item()
         critic_update_magnitude = (critic_before - critic_after).norm().cpu().item()

-        if self.config.device != "cpu":
+        if self.config.learner_config.device.type != "cpu":
             torch.cuda.current_stream().synchronize()

         tot_clip = sum(
rlgym_learn_algos/ppo/trajectory_processor.py

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar

 from rlgym.api import ActionType, AgentID, ObsType, RewardType
-from torch import Tensor
+from torch import Tensor, device, dtype

 from .trajectory import Trajectory

@@ -16,8 +16,8 @@ TRAJECTORY_PROCESSOR_FILE = "trajectory_processor.json"
 @dataclass
 class DerivedTrajectoryProcessorConfig(Generic[TrajectoryProcessorConfig]):
     trajectory_processor_config: TrajectoryProcessorConfig
-    dtype:
-    device:
+    dtype: dtype
+    device: device
     checkpoint_load_folder: Optional[str] = None

rlgym_learn_algos/rlgym_learn_algos.cp310-win32.pyd

Binary file (no content diff shown).
rlgym_learn_algos/util/torch_pydantic.py (new file)

@@ -0,0 +1,118 @@
+from typing import Annotated, Any
+
+import torch
+from pydantic import (
+    BaseModel,
+    GetCoreSchemaHandler,
+    GetJsonSchemaHandler,
+    ValidationError,
+)
+from pydantic.json_schema import JsonSchemaValue
+from pydantic_core import core_schema
+
+dtype_str_regex = "|".join(
+    set(
+        f"({str(v)[6:]})" for v in torch.__dict__.values() if isinstance(v, torch.dtype)
+    )
+)
+device_str_regex = (
+    "("
+    + "|".join(
+        f"({v})"
+        for v in [
+            "cpu",
+            "cuda",
+            "ipu",
+            "xpu",
+            "mkldnn",
+            "opengl",
+            "opencl",
+            "ideep",
+            "hip",
+            "ve",
+            "fpga",
+            "maia",
+            "xla",
+            "lazy",
+            "vulkan",
+            "mps",
+            "meta",
+            "hpu",
+            "mtia",
+            "privateuseone",
+        ]
+    )
+    + ")(:\d+)"
+)
+
+
+# Created using the example here: https://docs.pydantic.dev/latest/concepts/types/#handling-third-party-types
+class _TorchDtypePydanticAnnotation:
+    @classmethod
+    def __get_pydantic_core_schema__(
+        cls,
+        _source_type: Any,
+        _handler: GetCoreSchemaHandler,
+    ) -> core_schema.CoreSchema:
+        from_str_schema = core_schema.chain_schema(
+            [
+                core_schema.str_schema(pattern=dtype_str_regex),
+                core_schema.no_info_plain_validator_function(
+                    lambda v: getattr(torch, v)
+                ),
+            ]
+        )
+
+        return core_schema.json_or_python_schema(
+            json_schema=from_str_schema,
+            python_schema=core_schema.union_schema(
+                [
+                    # check if it's an instance first before doing any further work
+                    core_schema.is_instance_schema(torch.dtype),
+                    from_str_schema,
+                ]
+            ),
+            serialization=core_schema.plain_serializer_function_ser_schema(
+                lambda v: str(v)[6:]
+            ),
+        )
+
+
+class _TorchDevicePydanticAnnotation:
+    @classmethod
+    def __get_pydantic_core_schema__(
+        cls,
+        _source_type: Any,
+        _handler: GetCoreSchemaHandler,
+    ) -> core_schema.CoreSchema:
+        from_str_schema = core_schema.chain_schema(
+            [
+                core_schema.str_schema(pattern=device_str_regex),
+                core_schema.no_info_plain_validator_function(lambda v: torch.device(v)),
+            ]
+        )
+        from_int_schema = core_schema.chain_schema(
+            [
+                core_schema.int_schema(ge=0),
+                core_schema.no_info_plain_validator_function(lambda v: torch.device(v)),
+            ]
+        )
+
+        return core_schema.json_or_python_schema(
+            json_schema=from_str_schema,
+            python_schema=core_schema.union_schema(
+                [
+                    # check if it's an instance first before doing any further work
+                    core_schema.is_instance_schema(torch.dtype),
+                    from_str_schema,
+                    from_int_schema,
+                ]
+            ),
+            serialization=core_schema.plain_serializer_function_ser_schema(
+                lambda v: str(v)
+            ),
+        )
+
+
+PydanticTorchDtype = Annotated[torch.dtype, _TorchDtypePydanticAnnotation]
+PydanticTorchDevice = Annotated[torch.device, _TorchDevicePydanticAnnotation]
{rlgym_learn_algos-0.1.5.dist-info → rlgym_learn_algos-0.2.0.dist-info}/RECORD

@@ -1,36 +1,37 @@
-rlgym_learn_algos-0.
-rlgym_learn_algos-0.
-rlgym_learn_algos-0.
-rlgym_learn_algos/
-rlgym_learn_algos/logging/metrics_logger.py,sha256=45FBH49OcHl5skvG9J9MIFJtAxbFo1TxtEvLWwjttSU,4122
-rlgym_learn_algos/logging/wandb_metrics_logger.py,sha256=funcqZYUarlKND7W79TThFc5d8j-a_CIQwAnOGYD-rs,6518
+rlgym_learn_algos-0.2.0.dist-info/METADATA,sha256=lhDW1yMI9RhHxSWjTN8WFWJ6O_WlgtFsm8NI1e-GLgI,2431
+rlgym_learn_algos-0.2.0.dist-info/WHEEL,sha256=2I8FMXqttZIv4Ceqp7SSjsZUyiuHAyz-otzopz4PdFA,92
+rlgym_learn_algos-0.2.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+rlgym_learn_algos/__init__.py,sha256=C7cRdL4lZrpk3ge_4_lGAbGodqWJXM56FfgO0keRPAY,207
 rlgym_learn_algos/logging/__init__.py,sha256=ouItskWI4ItuoFdL--rt9YXCt7MasA473lYPhmJnrFA,423
+rlgym_learn_algos/logging/dict_metrics_logger.py,sha256=qmqr0HSiHpm5rjyxfAdmXOeBSbgP_t36-e-enpOccnE,1991
+rlgym_learn_algos/logging/metrics_logger.py,sha256=0l69GSSrxRcPm0xAjvF7yEIis7jGNu70unXu3hnK0XE,4122
+rlgym_learn_algos/logging/wandb_metrics_logger.py,sha256=Kxi8y-nfoh3EI_OqLm4pDS-zhUWEjkS8F4TdD01dr9U,6939
+rlgym_learn_algos/ppo/__init__.py,sha256=o6B8wCRfeyipSNEGJFyB3SHYmxUytaQelX2zsted5cg,1184
 rlgym_learn_algos/ppo/actor.py,sha256=LZevg0kqRrb4PwF05ePK9b1JIBX04YkWjsPs7swZ9JY,1767
 rlgym_learn_algos/ppo/basic_critic.py,sha256=oyyo8x9K6mi2BsbA6_tRy2Av8Pimb35WspJkPpe8XdQ,1022
 rlgym_learn_algos/ppo/continuous_actor.py,sha256=1vdBUw2mQNFNu6A6ZrAztBjd4DmwjGkIIFLboMZ02lc,4417
 rlgym_learn_algos/ppo/critic.py,sha256=RB89WtiN52BEq5QCpGAPrASUnasac-Bpg7B0lM3UXHw,689
 rlgym_learn_algos/ppo/discrete_actor.py,sha256=Nuc3EndIQud3NGrkBIQgy-Z-okhXVrj6p6okSGD1KNY,2620
 rlgym_learn_algos/ppo/env_trajectories.py,sha256=gzQBRkzwZhlZeSvWL50cc8AOgBfsg5zUys0aTJj6aZU,3775
-rlgym_learn_algos/ppo/experience_buffer.py,sha256=
-rlgym_learn_algos/ppo/experience_buffer_numpy.py,sha256=
-rlgym_learn_algos/ppo/gae_trajectory_processor.py,sha256=
-rlgym_learn_algos/ppo/gae_trajectory_processor_pure_python.py,sha256=
+rlgym_learn_algos/ppo/experience_buffer.py,sha256=py7kwhRJFsPx5lyvcUVywLAsu5zbU_0wV_52Fb6Kb_4,11012
+rlgym_learn_algos/ppo/experience_buffer_numpy.py,sha256=Apk4x-pfRnitKJPW6LBZyOPIhgeJs_5EG7BbTCqMwjk,4761
+rlgym_learn_algos/ppo/gae_trajectory_processor.py,sha256=r-o5ajNSTNr5nZxsUc17KMuZR6c4l4NHHTIs2-WbMgE,4956
+rlgym_learn_algos/ppo/gae_trajectory_processor_pure_python.py,sha256=RpyDR6GQ1JXvwtoKkx5V3z3WvU9ElJdzfNtpPiZDaTc,6831
 rlgym_learn_algos/ppo/multi_discrete_actor.py,sha256=zSYeBBirjguSv_wO-peo06hioHiVhZQjnd-NYwJxmag,3127
-rlgym_learn_algos/ppo/ppo_agent_controller.py,sha256=
-rlgym_learn_algos/ppo/ppo_learner.py,sha256=
+rlgym_learn_algos/ppo/ppo_agent_controller.py,sha256=CH-xpO2mOCwe4iu_n9wPPsqLxEPYcbQhnvcJYs46qgM,23270
+rlgym_learn_algos/ppo/ppo_learner.py,sha256=z14GaL52mx7b20mQsuOLFXlpYVlR0_9Nn5HinImWaLY,15295
 rlgym_learn_algos/ppo/ppo_metrics_logger.py,sha256=niW8xgQLEBCGgTaVyiE_JqsU6RTjV6h-JzM-7c3JT38,2868
 rlgym_learn_algos/ppo/trajectory.py,sha256=IIH_IG8B_HkyxRPf-YsCyF1jQqNjDx752hgzAehG25I,719
-rlgym_learn_algos/ppo/trajectory_processor.py,sha256=
-rlgym_learn_algos/ppo/__init__.py,sha256=o6B8wCRfeyipSNEGJFyB3SHYmxUytaQelX2zsted5cg,1184
+rlgym_learn_algos/ppo/trajectory_processor.py,sha256=9-JE8hJkOgVJ-R3_9JYjFTUUTlnVq2-U43VP0HiY-sM,2059
 rlgym_learn_algos/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+rlgym_learn_algos/rlgym_learn_algos.cp310-win32.pyd,sha256=GuvLbJ2bJrcgAmafhTIuvnHsi7X71dPwlbZCaPgzhBI,339968
 rlgym_learn_algos/rlgym_learn_algos.pyi,sha256=NwY-sDZWM06TUiKPzxpfH1Td6G6E8TdxtRPgBSh-PPE,1203
+rlgym_learn_algos/stateful_functions/__init__.py,sha256=QS0KYjuzagNkYiYllXQmjoJn14-G7KZawq1Zvwh8alY,236
 rlgym_learn_algos/stateful_functions/batch_reward_type_numpy_converter.py,sha256=1yte5qYyl9LWdClHZ_YsF7R9dJqQeYfINMdgNF_59Gs,767
 rlgym_learn_algos/stateful_functions/numpy_obs_standardizer.py,sha256=OgtwCaxBGTySPMnE5D5VDKpJ0dsTEz9oHc08A96xRao,1604
 rlgym_learn_algos/stateful_functions/obs_standardizer.py,sha256=qPPc3--J_3mpJJ-QHJjta6dbWWBobL7SYdK5MUP-XMw,606
-rlgym_learn_algos/
+rlgym_learn_algos/util/__init__.py,sha256=VPM6SN4T_625H9t30s9EiLeXiEEWgcyRVHa-LLVNrn4,47
 rlgym_learn_algos/util/running_stats.py,sha256=0tiGFpKtHWzMa1CxM_ueBzd_ryX4bJBriC8MXcSLg8w,4479
 rlgym_learn_algos/util/torch_functions.py,sha256=CTTHzTIi7u1O9HyX0cVJOrnYVbAtnlVs0g1fO9s3ano,3458
-rlgym_learn_algos/util/
-rlgym_learn_algos/
-rlgym_learn_algos/rlgym_learn_algos.cp310-win32.pyd,sha256=Z3ibd7sjCsmUOw1ENr_aWX81l86hwqt9St0qdAe18IM,339968
+rlgym_learn_algos/util/torch_pydantic.py,sha256=pgj3I-3q8iW9qtOCv1fgjNkZgA00G_Rdkb4qJPk5gxo,3530
-rlgym_learn_algos-0.1.5.dist-info/RECORD,,
+rlgym_learn_algos-0.2.0.dist-info/RECORD,,
{rlgym_learn_algos-0.1.5.dist-info → rlgym_learn_algos-0.2.0.dist-info}/licenses/LICENSE

File without changes.