agilerl 2.3.2.dev0__tar.gz → 2.3.3.dev1__tar.gz
This diff shows the changes between two publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/PKG-INFO +1 -1
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/core/base.py +5 -2
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/cqn.py +0 -1
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/ippo.py +3 -1
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/ppo.py +45 -61
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/components/rollout_buffer.py +52 -125
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/networks/actors.py +8 -4
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/networks/distributions.py +2 -1
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/rollouts/on_policy.py +3 -4
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/training/train_multi_agent_on_policy.py +7 -1
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/algo_utils.py +234 -149
- agilerl-2.3.3.dev1/agilerl/utils/cache.py +129 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/evolvable_networks.py +27 -105
- agilerl-2.3.3.dev1/agilerl/utils/ilql_utils.py +83 -0
- agilerl-2.3.3.dev1/agilerl/utils/log_utils.py +138 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/pyproject.toml +1 -1
- agilerl-2.3.2.dev0/agilerl/utils/cache.py +0 -56
- agilerl-2.3.2.dev0/agilerl/utils/ilql_utils.py +0 -34
- agilerl-2.3.2.dev0/agilerl/utils/log_utils.py +0 -70
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/LICENSE +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/README.md +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/bc_lm.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/core/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/core/optimizer_wrapper.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/core/registry.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/ddpg.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/dqn.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/dqn_rainbow.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/grpo.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/ilql.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/maddpg.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/matd3.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/neural_ts_bandit.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/neural_ucb_bandit.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/algorithms/td3.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/components/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/components/data.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/components/multi_agent_replay_buffer.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/components/replay_buffer.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/components/sampler.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/components/segment_tree.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/data/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/data/language_environment.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/data/rl_data.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/data/tokenizer.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/data/torch_datasets.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/hpo/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/hpo/mutation.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/hpo/tournament.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/base.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/bert.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/cnn.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/configs.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/custom_components.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/dummy.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/gpt.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/lstm.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/mlp.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/multi_input.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/resnet.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/modules/simba.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/networks/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/networks/base.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/networks/custom_modules.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/networks/distributions_experimental.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/networks/q_networks.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/networks/value_networks.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/protocols.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/rollouts/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/training/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/training/train_bandits.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/training/train_llm.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/training/train_multi_agent_off_policy.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/training/train_off_policy.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/training/train_offline.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/training/train_on_policy.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/typing.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/llm_utils.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/minari_utils.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/probe_envs.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/probe_envs_ma.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/sampling_utils.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/torch_utils.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/utils/utils.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/vector/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/vector/pz_async_vec_env.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/vector/pz_vec_env.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/wrappers/__init__.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/wrappers/agent.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/wrappers/learning.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/wrappers/make_evolvable.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/wrappers/pettingzoo_wrappers.py +0 -0
- {agilerl-2.3.2.dev0 → agilerl-2.3.3.dev1}/agilerl/wrappers/utils.py +0 -0
agilerl/algorithms/core/base.py
@@ -73,6 +73,8 @@ from agilerl.utils.algo_utils import (
     chkpt_attribute_to_device,
     clone_llm,
     create_warmup_cosine_scheduler,
+    get_input_size_from_space,
+    get_output_size_from_space,
     isroutine,
     key_in_nested_dict,
     module_checkpoint_dict,
@@ -84,8 +86,6 @@ from agilerl.utils.evolvable_networks import (
     compile_model,
     config_from_dict,
     get_default_encoder_config,
-    get_input_size_from_space,
-    get_output_size_from_space,
     is_image_space,
     is_vector_space,
 )
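The two import hunks above move `get_input_size_from_space` and `get_output_size_from_space` from `agilerl.utils.evolvable_networks` to `agilerl.utils.algo_utils`. A minimal sketch of the import path this release expects (only the import itself is taken from the diff; whether the old module still re-exports the helpers is not shown here):

```python
# New import location used throughout 2.3.3.dev1:
from agilerl.utils.algo_utils import (
    get_input_size_from_space,
    get_output_size_from_space,
)
```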
@@ -144,6 +144,9 @@ def get_checkpoint_dict(
         attribute_dict.pop("actor", None)
         return attribute_dict
 
+    if "rollout_buffer" in attribute_dict:
+        attribute_dict.pop("rollout_buffer")
+
     # Get checkpoint dictionaries for evolvable modules and optimizers
     network_info: Dict[str, Dict[str, Any]] = {"modules": {}, "optimizers": {}}
     for attr in agent.evolvable_attributes():
agilerl/algorithms/ippo.py
@@ -722,7 +722,9 @@ class IPPO(MultiAgentRLAlgorithm):
         returns = advantages + values
 
         states = concatenate_experiences_into_batches(states, obs_space)
-        actions = concatenate_experiences_into_batches(
+        actions = concatenate_experiences_into_batches(
+            actions, action_space, actions=True
+        )
         log_probs = log_probs.reshape((-1,))
         experiences = (states, actions, log_probs, advantages, returns, values)
 
agilerl/algorithms/ppo.py
@@ -464,20 +464,27 @@ class PPO(RLAlgorithm):
         self,
         obs: ArrayOrTensor,
         actions: ArrayOrTensor,
-
+        hidden_state: Optional[Dict[str, ArrayOrTensor]] = None,
+    ) -> Tuple[
+        torch.Tensor, torch.Tensor, torch.Tensor, Optional[Dict[str, ArrayOrTensor]]
+    ]:
         """Evaluates the actions.
 
         :param obs: Environment observation, or multiple observations in a batch
         :type obs: ArrayOrTensor
         :param actions: Actions to evaluate
         :type actions: ArrayOrTensor
-        :
-        :
+        :param hidden_state: Hidden state for recurrent policies, defaults to None. Expected shape: dict with tensors of shape (batch_size, 1, hidden_size).
+        :type hidden_state: Optional[Dict[str, ArrayOrTensor]]
+        :return: Log probability, entropy, state values, and next hidden state
+        :rtype: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[Dict[str, ArrayOrTensor]]]
         """
         obs = self.preprocess_observation(obs)
 
         # Get values from actor-critic
-        _, _, entropy, values,
+        _, _, entropy, values, next_hidden_state = self._get_action_and_values(
+            obs, hidden_state=hidden_state, sample=False
+        )
 
         log_prob = self.actor.action_log_prob(actions)
 
@@ -485,7 +492,7 @@ class PPO(RLAlgorithm):
         if entropy is None:
             entropy = -log_prob.mean()
 
-        return log_prob, entropy, values
+        return log_prob, entropy, values, next_hidden_state
 
     def get_action(
         self,
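With the change above, `PPO.evaluate_actions` accepts an optional `hidden_state` and returns a four-tuple. A minimal usage sketch, assuming a PPO agent can be constructed directly from gymnasium spaces as in the AgileRL docs (constructor defaults and tensor shapes here are assumptions, not taken from the diff):

```python
import numpy as np
import torch
from gymnasium import spaces

from agilerl.algorithms import PPO

# Assumed construction from spaces; other constructor arguments left at defaults.
observation_space = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
action_space = spaces.Discrete(2)
agent = PPO(observation_space, action_space)

obs = torch.randn(8, 4)              # batch of 8 observations
actions = torch.randint(0, 2, (8,))  # batch of 8 discrete actions

# The method now returns (log_prob, entropy, values, next_hidden_state);
# next_hidden_state is only meaningful for recurrent policies.
log_prob, entropy, values, next_hidden = agent.evaluate_actions(
    obs=obs, actions=actions, hidden_state=None
)
```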
@@ -659,7 +666,7 @@ class PPO(RLAlgorithm):
         num_samples = experiences[4].size(0)
         batch_idxs = np.arange(num_samples)
         mean_loss = 0
-        for
+        for _ in range(self.update_epochs):
             np.random.shuffle(batch_idxs)
             for start in range(0, num_samples, self.batch_size):
                 minibatch_idxs = batch_idxs[start : start + self.batch_size]
@@ -679,8 +686,8 @@ class PPO(RLAlgorithm):
                 batch_values = batch_values.squeeze()
 
                 if len(minibatch_idxs) > 1:
-                    log_prob, entropy, value = self.evaluate_actions(
-                        obs=batch_observations, actions=batch_actions
+                    log_prob, entropy, value, _ = self.evaluate_actions(
+                        obs=batch_observations, actions=batch_actions, hidden_state=None
                     )
 
                     logratio = log_prob - batch_log_probs
@@ -754,12 +761,8 @@ class PPO(RLAlgorithm):
             warnings.warn("Buffer data is empty. Skipping learning step.")
             return 0.0
 
-        observations = buffer_td["observations"]
-        advantages = buffer_td["advantages"]
-
         batch_size = self.batch_size
-        num_samples =
-
+        num_samples = self.rollout_buffer.size()
         indices = np.arange(num_samples)
        mean_loss = 0.0
         approx_kl_divs = []
@@ -775,7 +778,7 @@ class PPO(RLAlgorithm):
                 mb_obs = minibatch_td["observations"]
                 mb_actions = minibatch_td["actions"]
                 mb_log_probs = minibatch_td["log_probs"]
-                mb_advantages = advantages
+                mb_advantages = minibatch_td["advantages"]
                 mb_returns = minibatch_td["returns"]
                 mb_old_values = minibatch_td["values"]
 
@@ -799,23 +802,19 @@ class PPO(RLAlgorithm):
                         "Recurrent policy, but no hidden_states found in minibatch_td for flat learning."
                     )
 
-
-
-                    hidden_state=eval_hidden_state,
-                    sample=False,  # No sampling during evaluation for loss calculation
-                )
-
-                log_probs = self.actor.action_log_prob(mb_actions)
+                if isinstance(self.action_space, spaces.Discrete):
+                    mb_actions = mb_actions.squeeze(-1)
 
-
-
+                log_probs, entropy, values, _ = self.evaluate_actions(
+                    obs=mb_obs, actions=mb_actions, hidden_state=eval_hidden_state
+                )
 
                 # Normalize advantages
                 mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                     mb_advantages.std() + 1e-8
                 )
 
-                # Policy
+                # Policy loss
                 ratio = torch.exp(log_probs - mb_log_probs)
                 policy_loss1 = -mb_advantages * ratio
                 policy_loss2 = -mb_advantages * torch.clamp(
@@ -943,12 +942,10 @@ class PPO(RLAlgorithm):
             warnings.warn("No BPTT sequences to sample. Skipping learning.")
             return 0.0
 
-
-
-        )  # Here, batch_size means number of sequences per minibatch
+        # Here, batch_size means number of sequences per minibatch
+        sequences_per_minibatch = self.batch_size
         mean_loss = 0.0
         total_minibatch_updates_total = 0
-
         for epoch in range(self.update_epochs):
             approx_kl_divs_epoch = []  # KL divergences for this epoch's minibatches
             np.random.shuffle(all_start_coords)
@@ -982,24 +979,18 @@ class PPO(RLAlgorithm):
                     warnings.warn("Skipping empty or invalid minibatch of sequences.")
                     continue
 
-
-
-
-
-
-
-
-
-                ]
-
-
-
-                mb_returns_seq = current_minibatch_td[
-                    "returns"
-                ]  # Shape: (batch_seq, seq_len)
-
-                mb_initial_hidden_states_dict = current_minibatch_td.get_non_tensor(
-                    "initial_hidden_states", default=None
+                # Obs shape: (batch_seq, seq_len, *obs_dims) or nested TD
+                # Actions shape: (batch_seq, seq_len, *act_dims)
+                # Other tensors shape: (batch_seq, seq_len)
+                mb_obs_seq = current_minibatch_td["observations"]
+                mb_actions_seq = current_minibatch_td["actions"]
+                mb_old_log_probs_seq = current_minibatch_td["log_probs"]
+                mb_advantages_seq = current_minibatch_td["advantages"]
+                mb_returns_seq = current_minibatch_td["returns"]
+                mb_initial_hidden_states_dict: Optional[TensorDict] = (
+                    current_minibatch_td.get_non_tensor(
+                        "initial_hidden_states", default=None
+                    )
                 )
 
                 policy_loss_total, value_loss_total, entropy_loss_total = 0.0, 0.0, 0.0
@@ -1027,26 +1018,19 @@ class PPO(RLAlgorithm):
                         )
                         adv_t, return_t = mb_advantages_seq[:, t], mb_returns_seq[:, t]
 
+                        # new_value_t: (batch_seq,), entropy_t: (batch_seq,) or scalar, log_prob_t: (batch_seq,)
                         (
-
-                            _,
+                            new_log_prob_t,
                             entropy_t,
                             new_value_t,
                             next_hidden_state_for_actor_step,
-                        ) = self.
-                            obs_t,
+                        ) = self.evaluate_actions(
+                            obs=obs_t,
+                            actions=actions_t,
                             hidden_state=current_step_hidden_state_actor,
-
-
-
-                        new_log_prob_t = self.actor.action_log_prob(
-                            actions_t
-                        )  # Shape: (batch_seq,)
-                        entropy_t = (
-                            (-new_log_prob_t.mean())
-                            if entropy_t is None
-                            else entropy_t.mean()
-                        )  # Ensure scalar
+                        )
+                        if isinstance(entropy_t, torch.Tensor):
+                            entropy_t = entropy_t.mean()
 
                         ratio = torch.exp(new_log_prob_t - old_log_prob_t)
                         policy_loss1 = -adv_t * ratio
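The BPTT hunks above route each timestep through `evaluate_actions` while threading the recurrent hidden state forward. The toy below only illustrates that hidden-state threading pattern over a `(batch_seq, seq_len, obs_dim)` minibatch; the module and names are hypothetical stand-ins, not AgileRL classes:

```python
import torch
import torch.nn as nn

# Hypothetical toy recurrent evaluator: not part of AgileRL, only an
# illustration of carrying a hidden state across the timesteps of a
# (batch_seq, seq_len, obs_dim) minibatch, as the BPTT loop above does.
class ToyRecurrentPolicy(nn.Module):
    def __init__(self, obs_dim: int = 4, hidden_size: int = 8):
        super().__init__()
        self.gru = nn.GRUCell(obs_dim, hidden_size)
        self.value_head = nn.Linear(hidden_size, 1)

    def evaluate_step(self, obs_t, hidden):
        # One timestep: update the hidden state and produce a value estimate.
        next_hidden = self.gru(obs_t, hidden)
        value_t = self.value_head(next_hidden).squeeze(-1)
        return value_t, next_hidden

policy = ToyRecurrentPolicy()
obs_seq = torch.randn(3, 5, 4)   # (batch_seq, seq_len, obs_dim)
hidden = torch.zeros(3, 8)       # initial hidden state per sequence
values = []
for t in range(obs_seq.size(1)):
    value_t, hidden = policy.evaluate_step(obs_seq[:, t], hidden)
    values.append(value_t)
values = torch.stack(values, dim=1)  # (batch_seq, seq_len)
```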
agilerl/components/rollout_buffer.py
@@ -1,34 +1,15 @@
 import random  # Added to support random sequence sampling for BPTT
 import warnings
+from collections import OrderedDict
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 from gymnasium import spaces
-from tensordict import TensorDict
-
-from agilerl.typing import ArrayOrTensor, ObservationType
-
-
-# Define the utility function locally to avoid circular import
-def convert_np_to_torch_dtype(np_dtype):
-    """Converts a numpy dtype to a torch dtype."""
-    if np_dtype == np.float32:
-        return torch.float32
-    elif np_dtype == np.float64:
-        return torch.float64
-    elif np_dtype == np.int32:
-        return torch.int32
-    elif np_dtype == np.int64:
-        return torch.int64
-    elif np_dtype == np.uint8:
-        return torch.uint8
-    elif np_dtype == np.bool_:
-        return torch.bool
-    else:
-        # Fallback or raise error for unhandled dtypes
-        warnings.warn(f"Unhandled numpy dtype {np_dtype}, defaulting to torch.float32")
-        return torch.float32
+from tensordict import TensorDict
+
+from agilerl.typing import ArrayOrTensor, ObservationType, TorchObsType
+from agilerl.utils.algo_utils import get_num_actions, get_obs_shape, maybe_add_batch_dim
 
 
 class RolloutBuffer:
@@ -97,69 +78,39 @@ class RolloutBuffer:
         self.full = False
         self._initialize_buffers()
 
+    def _maybe_reshape_obs(
+        self, obs: TorchObsType, space: spaces.Space
+    ) -> TorchObsType:
+        """Reshape observation to the correct shape.
+
+        :param obs: Observation to reshape.
+        :type obs: TorchObsType
+        :param space: Observation space.
+        :type space: spaces.Space
+        :return: Reshaped observation.
+        :rtype: TorchObsType
+        """
+        if isinstance(space, spaces.Discrete) and obs.ndim < 2:
+            obs = obs.unsqueeze(-1)
+
+        return maybe_add_batch_dim(obs, space)
+
     def _initialize_buffers(self) -> None:
         """Initialize buffer arrays with correct shapes for vectorized environments."""
         # Determine shapes and dtypes for all expected fields
-
-
-        elif isinstance(self.observation_space, spaces.MultiDiscrete):
-            obs_shape = (len(self.observation_space.nvec),)
-        elif isinstance(self.observation_space, spaces.Box):
-            obs_shape = self.observation_space.shape
-        elif isinstance(self.observation_space, spaces.Dict):
-            # For Dict observation spaces, we'll create a nested structure
-            # The observations will be stored as nested TensorDicts
-            obs_shape = None  # Will be handled as nested TensorDict
-        elif isinstance(self.observation_space, spaces.Tuple):
-            # For Tuple, we'll flatten or handle as multiple entries
-            # For now, let's assume we'll pre-allocate based on flattened structure
-            obs_shape = ()  # Placeholder, will be determined by actual data
-        else:
-            obs_shape = self.observation_space.shape
-
-        if isinstance(self.action_space, spaces.Discrete):
-            action_shape = ()
-            action_dtype = torch.int64
-        elif isinstance(self.action_space, spaces.Box):
-            action_shape = self.action_space.shape
-            action_dtype = convert_np_to_torch_dtype(
-                self.action_space.dtype
-            )  # Convert numpy dtype to torch dtype
-        elif isinstance(self.action_space, spaces.MultiDiscrete):
-            action_shape = (len(self.action_space.nvec),)
-            action_dtype = torch.int64
-        elif isinstance(self.action_space, spaces.MultiBinary):
-            action_shape = (self.action_space.n,)
-            action_dtype = torch.int64
-        else:
-            try:
-                action_shape = self.action_space.shape
-                action_dtype = convert_np_to_torch_dtype(
-                    getattr(self.action_space, "dtype", np.float32)
-                )  # Convert numpy dtype to torch dtype
-            except AttributeError:
-                raise TypeError(
-                    f"Unsupported action space type without shape: {type(self.action_space)}"
-                )
+        obs_shape = get_obs_shape(self.observation_space)
+        num_actions = get_num_actions(self.action_space)
 
         # Create a source TensorDict with appropriately sized tensors
         # The tensors will be on the CPU by default, can be moved to device later if needed.
-        source_dict =
-
-
-
-
-
-            for key, subspace in self.observation_space.spaces.items():
-                if isinstance(subspace, spaces.Discrete):
-                    sub_shape = (1,)
-                elif isinstance(subspace, spaces.Box):
-                    sub_shape = subspace.shape
-                else:
-                    sub_shape = subspace.shape if hasattr(subspace, "shape") else ()
-
+        source_dict = OrderedDict()
+        if isinstance(
+            self.observation_space, spaces.Dict
+        ):  # Nested structure for Dict spaces
+            obs_dict = OrderedDict()
+            for key, shape in obs_shape.items():
                 obs_dict[key] = torch.zeros(
-                    (self.capacity, self.num_envs, *
+                    (self.capacity, self.num_envs, *shape), dtype=torch.float32
                 )
 
             source_dict["observations"] = obs_dict
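The refactor above replaces the buffer's ad-hoc space handling with `get_obs_shape`, `get_num_actions`, and `maybe_add_batch_dim`, plus the `_maybe_reshape_obs` helper shown in the hunk. The toy function below is not the AgileRL implementation; it only sketches the target layout, `(num_envs, *obs_shape)`, with Discrete observations stored with a trailing dimension of 1:

```python
import torch
from gymnasium import spaces

# Toy stand-in for the reshaping the buffer now delegates to helpers; an
# illustration of the assumed target shapes only, not AgileRL code.
def toy_reshape_obs(obs: torch.Tensor, space: spaces.Space) -> torch.Tensor:
    if isinstance(space, spaces.Discrete) and obs.ndim < 2:
        obs = obs.unsqueeze(-1)            # Discrete obs kept with a trailing dim of 1
    expected_ndim = len(space.shape) + 1 if space.shape else 2
    if obs.ndim < expected_ndim:
        obs = obs.unsqueeze(0)             # add the missing num_envs dimension
    return obs

print(toy_reshape_obs(torch.zeros(3), spaces.Box(-1, 1, shape=(3,))).shape)  # (1, 3)
print(toy_reshape_obs(torch.tensor(2), spaces.Discrete(5)).shape)            # (1, 1)
```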
@@ -179,7 +130,7 @@ class RolloutBuffer:
         source_dict.update(
             {
                 "actions": torch.zeros(
-                    (self.capacity, self.num_envs,
+                    (self.capacity, self.num_envs, num_actions), dtype=torch.float32
                 ),
                 "rewards": torch.zeros(
                     (self.capacity, self.num_envs), dtype=torch.float32
@@ -283,44 +234,27 @@ class RolloutBuffer:
         )
 
         # Prepare data as a dictionary of tensors for the current time step
-        current_step_data =
+        current_step_data = OrderedDict()
 
         # Convert inputs to tensors and ensure correct device (CPU for buffer storage)
         # Also ensure they have the (num_envs, ...) shape
-
-
-        if isinstance(obs, dict):  # Dict observation space
-            obs_dict = {}
+        if isinstance(self.observation_space, spaces.Dict):
+            obs_dict = OrderedDict()
             for key, item in obs.items():
+                sub_space = self.observation_space.spaces[key]
                 obs_tensor = torch.as_tensor(item, device="cpu")
-
-                    obs_tensor = obs_tensor.unsqueeze(0)
-                elif (
-                    self.num_envs == 1
-                    and len(obs_tensor.shape)
-                    < len(self.observation_space.spaces[key].shape) + 1
-                ):
-                    obs_tensor = obs_tensor.unsqueeze(0)
-
-                obs_dict[key] = obs_tensor
+                obs_dict[key] = self._maybe_reshape_obs(obs_tensor, sub_space)
 
             current_step_data["observations"] = obs_dict
         else:
             obs_tensor = torch.as_tensor(obs, device="cpu")
-
-            self.
-
-            ):  # Add batch dim for single env
-                obs_tensor = obs_tensor.unsqueeze(0)
-
-            current_step_data["observations"] = obs_tensor
+            current_step_data["observations"] = self._maybe_reshape_obs(
+                obs_tensor, self.observation_space
+            )
 
         # Actions
         action_tensor = torch.as_tensor(action, device="cpu")
-
-            action_tensor = action_tensor.unsqueeze(0)
-
-        current_step_data["actions"] = action_tensor
+        current_step_data["actions"] = action_tensor.reshape(self.num_envs, -1)
 
         # Rewards
         reward_tensor = torch.as_tensor(reward, dtype=torch.float32, device="cpu")
@@ -340,28 +274,21 @@ class RolloutBuffer:
 
         # Next Observations
         if next_obs is not None:
-            if isinstance(
-                next_obs_dict =
+            if isinstance(self.observation_space, spaces.Dict):
+                next_obs_dict = OrderedDict()
                 for key, item in next_obs.items():
+                    sub_space = self.observation_space.spaces[key]
                     next_obs_tensor = torch.as_tensor(item, device="cpu")
-
-                        next_obs_tensor
-
-
-                        and len(next_obs_tensor.shape)
-                        < len(self.observation_space.spaces[key].shape) + 1
-                    ):
-                        next_obs_tensor = next_obs_tensor.unsqueeze(0)
-                    next_obs_dict[key] = next_obs_tensor
+                    next_obs_dict[key] = self._maybe_reshape_obs(
+                        next_obs_tensor, sub_space
+                    )
+
                 current_step_data["next_observations"] = next_obs_dict
             else:
                 next_obs_tensor = torch.as_tensor(next_obs, device="cpu")
-
-                self.
-
-                ):  # Add batch dim
-                    next_obs_tensor = next_obs_tensor.unsqueeze(0)
-                current_step_data["next_observations"] = next_obs_tensor
+                current_step_data["next_observations"] = self._maybe_reshape_obs(
+                    next_obs_tensor, self.observation_space
+                )
 
         # Episode Starts
         if episode_start is not None:
@@ -493,7 +420,7 @@ class RolloutBuffer:
 
         # Get a view of the buffer up to the current position and for all envs
         # This slice will have batch_size [buffer_size, num_envs]
-        valid_buffer_data = self.buffer[:buffer_size]
+        valid_buffer_data: TensorDict = self.buffer[:buffer_size]
 
         # Reshape to flatten the num_envs dimension into the first batch dimension
         # New batch_size will be [buffer_size * num_envs]
agilerl/networks/actors.py
@@ -8,6 +8,7 @@ from agilerl.modules.configs import MlpNetConfig
 from agilerl.networks.base import EvolvableNetwork
 from agilerl.networks.distributions import EvolvableDistribution
 from agilerl.typing import ArrayOrTensor, NetConfigType, TorchObsType
+from agilerl.utils.algo_utils import get_output_size_from_space
 
 
 class DeterministicActor(EvolvableNetwork):
@@ -105,6 +106,8 @@ class DeterministicActor(EvolvableNetwork):
         else:
             head_config["output_activation"] = output_activation
 
+        self.output_size = get_output_size_from_space(self.action_space)
+
         self.build_network_head(head_config)
         self.output_activation = head_config.get("output_activation", output_activation)
 
@@ -155,7 +158,7 @@ class DeterministicActor(EvolvableNetwork):
         """
         self.head_net = self.create_mlp(
             num_inputs=self.latent_dim,
-            num_outputs=
+            num_outputs=self.output_size,
             name="actor",
             net_config=net_config,
         )
@@ -188,7 +191,7 @@ class DeterministicActor(EvolvableNetwork):
 
         head_net = self.create_mlp(
             num_inputs=self.latent_dim,
-            num_outputs=
+            num_outputs=self.output_size,
             name="actor",
             net_config=self.head_net.net_config,
         )
@@ -290,6 +293,7 @@ class StochasticActor(EvolvableNetwork):
         self.squash_output = squash_output
         self.action_space = action_space
         self.use_experimental_distribution = use_experimental_distribution
+        self.output_size = get_output_size_from_space(self.action_space)
 
         self.build_network_head(head_config)
         self.output_activation = None
@@ -327,7 +331,7 @@ class StochasticActor(EvolvableNetwork):
         """
         self.head_net = self.create_mlp(
             num_inputs=self.latent_dim,
-            num_outputs=
+            num_outputs=self.output_size,
             name="actor",
             net_config=net_config,
         )
@@ -389,7 +393,7 @@ class StochasticActor(EvolvableNetwork):
 
         head_net = self.create_mlp(
             num_inputs=self.latent_dim,
-            num_outputs=
+            num_outputs=self.output_size,
             name="actor",
             net_config=self.head_net.net_config,
         )
agilerl/networks/distributions.py
@@ -7,6 +7,7 @@ from torch.distributions import Bernoulli, Categorical, Distribution, Normal
 
 from agilerl.modules.base import EvolvableModule, EvolvableWrapper
 from agilerl.typing import ArrayOrTensor, DeviceType, NetConfigType
+from agilerl.utils.algo_utils import get_output_size_from_space
 
 DistributionType = Union[Distribution, List[Distribution]]
 
@@ -328,7 +329,7 @@ class EvolvableDistribution(EvolvableWrapper):
         super().__init__(network)
 
         self.action_space = action_space
-        self.action_dim =
+        self.action_dim = get_output_size_from_space(action_space)
         self.action_std_init = action_std_init
         self.device = device
         self.squash_output = squash_output and isinstance(action_space, spaces.Box)
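`get_output_size_from_space` now supplies both the actor head width and `EvolvableDistribution.action_dim`. The mapping below is only an illustrative assumption of what such a helper typically returns for common gymnasium spaces; it is not the actual body of the AgileRL function:

```python
import numpy as np
from gymnasium import spaces

# Illustrative expectation only; NOT agilerl.utils.algo_utils.get_output_size_from_space.
def toy_output_size(space: spaces.Space) -> int:
    if isinstance(space, spaces.Discrete):
        return int(space.n)
    if isinstance(space, spaces.MultiDiscrete):
        return int(np.sum(space.nvec))
    if isinstance(space, spaces.MultiBinary):
        return int(space.n)
    if isinstance(space, spaces.Box):
        return int(np.prod(space.shape))
    raise TypeError(f"Unsupported action space: {type(space)}")

print(toy_output_size(spaces.Discrete(4)))               # 4
print(toy_output_size(spaces.Box(-1, 1, shape=(2, 3))))  # 6
```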
agilerl/rollouts/on_policy.py
@@ -55,9 +55,9 @@ def _collect_rollouts(
 
     if (
         last_obs is None
-
-
-
+        and last_done is None
+        and last_scores is None
+        and last_info is None
     ):
         obs, info = env.reset()
         scores = np.zeros(agent.num_envs)
@@ -169,7 +169,6 @@ def _collect_rollouts(
                 scores[idx] = 0
 
     # Calculate last value to compute returns and advantages properly
-    # TODO: We shouldn't access a hidden method here...
     with torch.no_grad():
         if recurrent:
             _, _, _, last_value, _ = agent._get_action_and_values(
agilerl/training/train_multi_agent_on_policy.py
@@ -14,6 +14,7 @@ from agilerl.algorithms import IPPO
 from agilerl.hpo.mutation import Mutations
 from agilerl.hpo.tournament import TournamentSelection
 from agilerl.networks import StochasticActor
+from agilerl.typing import SingleAgentModule
 from agilerl.utils.algo_utils import obs_channels_to_first
 from agilerl.utils.utils import (
     default_progress_bar,
@@ -192,6 +193,7 @@ def train_multi_agent_on_policy(
     pop_episode_scores = []
     pop_fps = []
     for agent_idx, agent in enumerate(pop):  # Loop through population
+        compiled_agent = agent.torch_compiler is not None
         agent.set_training_mode(True)
 
         obs, info = env.reset()  # Reset environment at start of episode
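The next hunk unwraps compiled policies via `_orig_mod` before the `isinstance(agent_policy, StochasticActor)` check. This is needed because `torch.compile` wraps an `nn.Module` in an `OptimizedModule`, so `isinstance` checks against the original class fail on the wrapper, as the small sketch below shows (the class here is a hypothetical stand-in):

```python
import torch
import torch.nn as nn

# Minimal illustration: torch.compile returns an OptimizedModule wrapper, and the
# original module is reachable through its _orig_mod attribute.
class TinyActor(nn.Module):
    def forward(self, x):
        return torch.tanh(x)

actor = TinyActor()
compiled = torch.compile(actor)

print(isinstance(compiled, TinyActor))            # False: it is an OptimizedModule
print(isinstance(compiled._orig_mod, TinyActor))  # True: the original module
```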
@@ -244,7 +246,11 @@ def train_multi_agent_on_policy(
                     )
                     agent_space = agent.possible_action_spaces[agent_id]
                     policy = getattr(agent, agent.registry.policy())
-                    agent_policy = policy[network_id]
+                    agent_policy: SingleAgentModule = policy[network_id]
+
+                    if compiled_agent:
+                        agent_policy = agent_policy._orig_mod
+
                     if isinstance(agent_policy, StochasticActor) and isinstance(
                         agent_space, spaces.Box
                     ):