PyPI - agilerl - Versions diffs - 2.5.0.dev0__tar.gz → 2.5.0.dev2__tar.gz - Mend

agilerl 2.5.0.dev0__tar.gz → 2.5.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (355) hide show

{agilerl-2.5.0.dev0 → agilerl-2.5.0.dev2}/.pre-commit-config.yaml RENAMED Viewed

@@ -43,8 +43,9 @@ repos:
       rev: 0.2.3
       hooks:
           - id: yamlfmt
     - repo: https://github.com/astral-sh/uv-pre-commit
       # uv version.
-      rev: 0.10.2
+      rev: 0.10.3
       hooks:
           - id: uv-lock

{agilerl-2.5.0.dev0 → agilerl-2.5.0.dev2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: agilerl
-Version: 2.5.0.dev0
+Version: 2.5.0.dev2
 Summary: AgileRL is a deep reinforcement learning library focused on improving RL development through RLOps.
 Author-email: Nick Ustaran-Anderegg <dev@agilerl.com>
 License-Expression: Apache-2.0
@@ -29,7 +29,7 @@ Requires-Dist: redis~=4.4.4
 Requires-Dist: supersuit~=3.9.0
 Requires-Dist: tensordict~=0.8
 Requires-Dist: termcolor~=1.1.0
-Requires-Dist: torch==2.7.1
+Requires-Dist: torch==2.9.0
 Requires-Dist: tqdm>=4.66.4
 Requires-Dist: wandb~=0.18.0
 Provides-Extra: all
@@ -37,13 +37,13 @@ Requires-Dist: datasets==4.4.1; extra == 'all'
 Requires-Dist: deepspeed~=0.17.1; extra == 'all'
 Requires-Dist: peft~=0.18.0; extra == 'all'
 Requires-Dist: transformers~=4.57.1; extra == 'all'
-Requires-Dist: vllm~=0.10.0; extra == 'all'
+Requires-Dist: vllm==0.13.0; extra == 'all'
 Provides-Extra: llm
 Requires-Dist: datasets==4.4.1; extra == 'llm'
 Requires-Dist: deepspeed~=0.17.1; extra == 'llm'
 Requires-Dist: peft~=0.18.0; extra == 'llm'
 Requires-Dist: transformers~=4.57.1; extra == 'llm'
-Requires-Dist: vllm~=0.10.0; extra == 'llm'
+Requires-Dist: vllm==0.13.0; extra == 'llm'
 Description-Content-Type: text/markdown
 <p align="center">

{agilerl-2.5.0.dev0 → agilerl-2.5.0.dev2}/agilerl/algorithms/core/base.py RENAMED Viewed

@@ -1561,27 +1561,16 @@ class MultiAgentRLAlgorithm(EvolvableAlgorithm, ABC):
                         nan_arr = np.empty(self.action_dims[agent_id])
                         nan_arr[:] = np.nan
                     else:
-                        nan_arr = np.array([[np.nan]])
+                        nan_arr = np.array([np.nan])
                     env_defined_actions[agent_id] = nan_arr
                     val = nan_arr
                 # Handle discrete actions + env not vectorized
                 if isinstance(val, (int, float)):
-                    val = np.array([[val]])
+                    val = np.array([val])
                     env_defined_actions[agent_id] = val
-                # Ensure additional dimension is added in so shapes align for masking
-                if isinstance(val, np.ndarray) and len(val.shape) == 1:
-                    val = (
-                        val[:, np.newaxis]
-                        if isinstance(
-                            self.possible_action_spaces[agent_id],
-                            spaces.Discrete,
-                        )
-                        else val[np.newaxis, :]
-                    )
-                    env_defined_actions[agent_id] = val
                 agent_masks[agent_id] = np.where(
                     np.isnan(env_defined_actions[agent_id]),
                     0,
@@ -1814,6 +1803,12 @@ class MultiAgentRLAlgorithm(EvolvableAlgorithm, ABC):
             for i, agent_id in enumerate(agent_ids):
                 output_dict[agent_id] = group_outputs[group_id][i]
+                if (
+                    isinstance(self.possible_action_spaces[agent_id], spaces.Discrete)
+                    and output_dict[agent_id].shape[-1] == 1
+                ):
+                    output_dict[agent_id] = output_dict[agent_id].squeeze(-1)
         return output_dict
     def sum_shared_rewards(self, rewards: ArrayDict) -> ArrayDict:
@@ -2302,7 +2297,7 @@ class LLMAlgorithm(EvolvableAlgorithm, ABC):
                 None,
                 None,
             )
-        if hasattr(self, "llm"):
+        if hasattr(self, "llm") and self.llm is not None:
             del self.llm.llm_engine.model_executor
             del self.llm
         gc.collect()

{agilerl-2.5.0.dev0 → agilerl-2.5.0.dev2}/agilerl/algorithms/ippo.py RENAMED Viewed

@@ -27,6 +27,7 @@ from agilerl.typing import (
     TorchObsType,
 )
 from agilerl.utils.algo_utils import (
+    apply_env_defined_actions,
     concatenate_experiences_into_batches,
     concatenate_tensors,
     get_experiences_samples,
@@ -601,10 +602,15 @@ class IPPO(MultiAgentRLAlgorithm):
         # If using env_defined_actions replace actions
         if env_defined_actions is not None:
-            for agent_id in unique_agents_ids:
-                action_dict[agent_id][agent_masks[agent_id]] = env_defined_actions[
-                    agent_id
-                ][agent_masks[agent_id]]
+            action_dict = apply_env_defined_actions(
+                unique_agents_ids,
+                action_dict,
+                env_defined_actions,
+                agent_masks,
+                discrete_actions=isinstance(
+                    next(iter(self.action_space.values())), spaces.Discrete
+                ),
+            )
         return (
             action_dict,

{agilerl-2.5.0.dev0 → agilerl-2.5.0.dev2}/agilerl/algorithms/maddpg.py RENAMED Viewed

@@ -25,6 +25,7 @@ from agilerl.typing import (
     SupportedObsSpaces,
 )
 from agilerl.utils.algo_utils import (
+    apply_env_defined_actions,
     concatenate_spaces,
     format_shared_critic_encoder,
     get_deepest_head_config,
@@ -515,10 +516,15 @@ class MADDPG(MultiAgentRLAlgorithm):
         # If using env_defined_actions replace actions
         if env_defined_actions is not None:
-            for agent in self.agent_ids:
-                processed_action_dict[agent][agent_masks[agent]] = env_defined_actions[
-                    agent
-                ][agent_masks[agent]]
+            action_dict = apply_env_defined_actions(
+                self.agent_ids,
+                processed_action_dict,
+                env_defined_actions,
+                agent_masks,
+                discrete_actions=isinstance(
+                    next(iter(self.action_space.values())), spaces.Discrete
+                ),
+            )
         return processed_action_dict, action_dict

{agilerl-2.5.0.dev0 → agilerl-2.5.0.dev2}/agilerl/algorithms/matd3.py RENAMED Viewed

@@ -24,6 +24,7 @@ from agilerl.typing import (
     StandardTensorDict,
 )
 from agilerl.utils.algo_utils import (
+    apply_env_defined_actions,
     concatenate_spaces,
     format_shared_critic_encoder,
     get_deepest_head_config,
@@ -575,10 +576,15 @@ class MATD3(MultiAgentRLAlgorithm):
         # If using env_defined_actions replace actions
         if env_defined_actions is not None:
-            for agent in self.agent_ids:
-                processed_action_dict[agent][agent_masks[agent]] = env_defined_actions[
-                    agent
-                ][agent_masks[agent]]
+            action_dict = apply_env_defined_actions(
+                self.agent_ids,
+                processed_action_dict,
+                env_defined_actions,
+                agent_masks,
+                discrete_actions=isinstance(
+                    next(iter(self.action_space.values())), spaces.Discrete
+                ),
+            )
         return processed_action_dict, action_dict

{agilerl-2.5.0.dev0 → agilerl-2.5.0.dev2}/agilerl/networks/actors.py RENAMED Viewed

@@ -252,9 +252,6 @@ class StochasticActor(EvolvableNetwork):
     :type recurrent: bool
     :param device: Device to use for the network.
     :type device: str
-    :param use_experimental_distribution: Whether to use the experimental distribution implementation, which
-        includes several optimizations related to using torch primitives for statistics calculations. Defaults to False.
-    :type use_experimental_distribution: bool
     :param random_seed: Random seed to use for the network. Defaults to None.
     :type random_seed: int | None
     :param encoder_name: Name of the encoder network.
@@ -284,7 +281,6 @@ class StochasticActor(EvolvableNetwork):
         simba: bool = False,
         recurrent: bool = False,
         device: str = "cpu",
-        use_experimental_distribution: bool = False,
         random_seed: int | None = None,
         encoder_name: str = "encoder",
     ) -> None:
@@ -312,7 +308,6 @@ class StochasticActor(EvolvableNetwork):
         self.action_std_init = action_std_init
         self.squash_output = squash_output
         self.action_space = action_space
-        self.use_experimental_distribution = use_experimental_distribution
         self.output_size = get_output_size_from_space(self.action_space)
         self.build_network_head(head_config)
@@ -332,14 +327,6 @@ class StochasticActor(EvolvableNetwork):
         else:
             self.action_low, self.action_high = None, None
-        # Wrap the network in an EvolvableDistribution
-        if use_experimental_distribution:
-            from agilerl.networks.distributions_experimental import (
-                EvolvableDistribution,
-            )
-        else:
-            from agilerl.networks.distributions import EvolvableDistribution
         self.head_net = EvolvableDistribution(
             action_space=action_space,
             network=self.head_net,

agilerl-2.5.0.dev0/agilerl/networks/distributions_experimental.py → agilerl-2.5.0.dev2/agilerl/networks/distributions.py RENAMED Viewed

@@ -1,20 +1,20 @@
-import math
+from typing import Union
 import numpy as np
 import torch
-import torch.nn.functional as F
 from gymnasium import spaces
 from agilerl.modules.base import EvolvableModule, EvolvableWrapper
 from agilerl.typing import ArrayOrTensor, DeviceType, NetConfigType
-# NOTE: we still import Normal / Bernoulli solely for continuous & binary helpers,
-#       but no Categorical objects are ever instantiated any more.
+from agilerl.utils.torch_utils import (
+    entropy_from_space,
+    log_prob_from_space,
+    sample_from_space,
+)
 def apply_action_mask_discrete(
-    logits: torch.Tensor,
-    mask: torch.Tensor,
+    logits: torch.Tensor, mask: torch.Tensor
 ) -> torch.Tensor:
     """Apply a mask to the logits.
@@ -37,185 +37,74 @@ class TorchDistribution:
     :param action_space: Action space of the environment.
     :type action_space: spaces.Space
     :param logits: Logits.
-    :type logits: torch.Tensor
+    :type logits: torch.Tensor | None
     :param mu: Mean.
-    :type mu: torch.Tensor
+    :type mu: torch.Tensor | None
     :param log_std: Log standard deviation.
-    :type log_std: torch.Tensor
+    :type log_std: torch.Tensor | None
     :param squash_output: Whether to squash the output to the action space.
     :type squash_output: bool
     """
     def __init__(
         self,
         *,
         action_space: spaces.Space,
-        logits: (
-            torch.Tensor | None
-        ) = None,  # for discrete / multidiscrete / multibinary
-        mu: torch.Tensor | None = None,  # for Box
+        logits: torch.Tensor | None = None,
+        mu: torch.Tensor | None = None,
         log_std: torch.Tensor | None = None,
         squash_output: bool = False,
-    ) -> None:
+    ):
         self.action_space = action_space
-        self.logits, self.mu, self.log_std = logits, mu, log_std
+        self.logits = logits
+        self.mu = mu
+        self.log_std = log_std
         self.squash_output = squash_output and isinstance(action_space, spaces.Box)
         self._sampled_action: torch.Tensor | None = None
-    # ------------------------------------------------------------------ #
-    # fast tensor-only primitives                                        #
-    # ------------------------------------------------------------------ #
     def sample(self) -> torch.Tensor:
-        if isinstance(self.action_space, spaces.Discrete):
-            probs = torch.softmax(self.logits, dim=-1)
-            self._sampled_action = torch.multinomial(probs, 1).squeeze(-1)
-            return self._sampled_action
-        if isinstance(self.action_space, spaces.Box):
-            eps = torch.randn_like(self.mu)
-            out = self.mu + torch.exp(self.log_std) * eps
-            if self.squash_output:
-                out = torch.tanh(out)
-            self._sampled_action = out
-            return out
-        # -------- MultiDiscrete --------
-        if isinstance(self.action_space, spaces.MultiDiscrete):
-            actions = []
-            offset = 0
-            for size in self.action_space.nvec:
-                logits_i = self.logits[:, offset : offset + size]
-                probs_i = torch.softmax(logits_i, dim=-1)
-                act_i = torch.multinomial(probs_i, 1).squeeze(-1)
-                actions.append(act_i)
-                offset += size
-            self._sampled_action = torch.stack(actions, dim=-1)
-            return self._sampled_action
-        # -------- MultiBinary --------
-        if isinstance(self.action_space, spaces.MultiBinary):
-            probs = torch.sigmoid(self.logits)
-            self._sampled_action = torch.bernoulli(
-                probs,
-            )  # Ensures float tensor, removed .to(torch.int64)
-            return self._sampled_action
+        """Sample from the distribution for the given action space.
-        msg = "Unsupported action space in fast path."
-        raise NotImplementedError(msg)
+        :return: Sampled action.
+        :rtype: torch.Tensor
+        """
+        self._sampled_action = sample_from_space(
+            self.action_space,
+            logits=self.logits,
+            mu=self.mu,
+            log_std=self.log_std,
+            squash_output=self.squash_output,
+        )
+        return self._sampled_action
     def log_prob(self, action: torch.Tensor) -> torch.Tensor:
-        if isinstance(self.action_space, spaces.Discrete):
-            log_p_all = torch.log_softmax(self.logits, dim=-1)  # Shape (B, N_actions)
-            action_long = action.long()
-            action_indices_for_gather: torch.Tensor
-            if action_long.ndim == log_p_all.ndim - 1:  # action_long is (B,)
-                action_indices_for_gather = action_long.unsqueeze(
-                    -1,
-                )  # Converts to (B,1)
-            elif action_long.ndim == log_p_all.ndim:  # action_long is (B, K)
-                if action_long.shape[-1] == 1:  # action_long is (B,1)
-                    action_indices_for_gather = action_long
-                elif (
-                    action_long.shape == log_p_all.shape
-                    and hasattr(self.action_space, "n")
-                    and action_long.shape[-1] == self.action_space.n
-                ):
-                    # Special handling for test case: action is (B, N_actions) for Discrete(N_actions)
-                    # Use argmax to get the action index.
-                    action_indices_for_gather = torch.argmax(
-                        action_long,
-                        dim=-1,
-                        keepdim=True,
-                    )  # Converts (B, N_actions) to (B,1)
-                else:
-                    msg = (
-                        f"Action shape {action.shape} is not compatible with Discrete space. "
-                        f"Expected (batch_size,), (batch_size, 1), or (batch_size, num_actions) for argmax case. "
-                        f"Logits shape: {log_p_all.shape}. Action space: {self.action_space}"
-                    )
-                    raise ValueError(
-                        msg,
-                    )
-            else:
-                msg = (
-                    f"Action tensor ndim {action.ndim} is not compatible with Discrete space logits ndim {log_p_all.ndim}. "
-                    f"Expected action ndim to be {log_p_all.ndim - 1} or {log_p_all.ndim}."
-                )
-                raise ValueError(
-                    msg,
-                )
-            return log_p_all.gather(-1, action_indices_for_gather).squeeze(-1)
-        if isinstance(self.action_space, spaces.Box):
-            var = torch.exp(2 * self.log_std)
-            return (
-                -0.5
-                * (
-                    ((action - self.mu) ** 2) / var
-                    + 2 * self.log_std
-                    + math.log(2 * math.pi)
-                )
-            ).sum(-1)
-        # -------- MultiDiscrete --------
-        if isinstance(self.action_space, spaces.MultiDiscrete):
-            logps = []
-            offset = 0
-            for idx, size in enumerate(self.action_space.nvec):
-                logits_i = self.logits[:, offset : offset + size]
-                logp_all = torch.log_softmax(logits_i, dim=-1)
-                act_i = action[:, idx].long()
-                logp_i = logp_all.gather(-1, act_i.unsqueeze(-1)).squeeze(-1)
-                logps.append(logp_i)
-                offset += size
-            return torch.stack(logps, dim=-1).sum(-1)
-        # -------- MultiBinary --------
-        if isinstance(self.action_space, spaces.MultiBinary):
-            # log sigma(x) and log (1-sigma(x))
-            log_p1 = -F.softplus(-self.logits)
-            log_p0 = -self.logits + log_p1
-            a = (
-                action.float()
-            )  # Action for MultiBinary is expected to be float (0.0 or 1.0)
-            return (a * log_p1 + (1.0 - a) * log_p0).sum(-1)
+        """Log probability of the action.
-        raise NotImplementedError
+        :param action: Action.
+        :type action: torch.Tensor
+        :return: Log probability of the action.
+        :rtype: torch.Tensor
+        """
+        return log_prob_from_space(
+            self.action_space,
+            action,
+            logits=self.logits,
+            mu=self.mu,
+            log_std=self.log_std,
+        )
     def entropy(self) -> torch.Tensor:
-        if isinstance(self.action_space, spaces.Discrete):
-            p = torch.softmax(self.logits, dim=-1)
-            return -(p * torch.log(p + 1e-8)).sum(-1)
+        """Entropy of the distribution.
-        if isinstance(self.action_space, spaces.Box):
-            return 0.5 * (1 + math.log(2 * math.pi)) * self.mu.size(
-                -1,
-            ) + self.log_std.sum(-1)
-        # -------- MultiDiscrete --------
-        if isinstance(self.action_space, spaces.MultiDiscrete):
-            entropies = []
-            offset = 0
-            for size in self.action_space.nvec:
-                logits_i = self.logits[:, offset : offset + size]
-                p_i = torch.softmax(logits_i, dim=-1)
-                ent_i = -(p_i * torch.log(p_i + 1e-8)).sum(-1)
-                entropies.append(ent_i)
-                offset += size
-            return torch.stack(entropies, dim=-1).sum(-1)
-        # -------- MultiBinary --------
-        if isinstance(self.action_space, spaces.MultiBinary):
-            p = torch.sigmoid(self.logits)
-            return -(p * torch.log(p + 1e-8) + (1 - p) * torch.log(1 - p + 1e-8)).sum(
-                -1,
-            )
-        raise NotImplementedError
+        :return: Entropy of the distribution.
+        :rtype: torch.Tensor
+        """
+        return entropy_from_space(
+            self.action_space,
+            logits=self.logits,
+            mu=self.mu,
+            log_std=self.log_std,
+        )
 class EvolvableDistribution(EvolvableWrapper):
@@ -247,7 +136,7 @@ class EvolvableDistribution(EvolvableWrapper):
         action_std_init: float = 0.0,
         squash_output: bool = False,
         device: DeviceType = "cpu",
-    ) -> None:
+    ):
         super().__init__(network)
         self.action_space = action_space
@@ -263,7 +152,7 @@ class EvolvableDistribution(EvolvableWrapper):
         if isinstance(action_space, spaces.Box):
             self.log_std = torch.nn.Parameter(
                 torch.ones(1, np.prod(action_space.shape), device=device)
-                * action_std_init,
+                * action_std_init
             )
     @property
@@ -286,7 +175,6 @@ class EvolvableDistribution(EvolvableWrapper):
         # Normal distribution for Continuous action spaces
         if isinstance(self.action_space, spaces.Box):
             log_std = self.log_std.expand_as(logits)
-            # Pass mu and log_std directly to TorchDistribution
             return TorchDistribution(
                 action_space=self.action_space,
                 mu=logits,
@@ -295,20 +183,30 @@ class EvolvableDistribution(EvolvableWrapper):
             )
         # Categorical distribution for Discrete action spaces
-        if isinstance(
-            self.action_space,
-            (spaces.Discrete, spaces.MultiDiscrete, spaces.MultiBinary),
-        ):
-            # Pass logits directly to TorchDistribution
+        if isinstance(self.action_space, spaces.Discrete):
             return TorchDistribution(
                 action_space=self.action_space,
                 logits=logits,
-                squash_output=self.squash_output,  # squash_output is ignored for discrete
+                squash_output=self.squash_output,
+            )
+        # List of categorical distributions for MultiDiscrete action spaces
+        if isinstance(self.action_space, spaces.MultiDiscrete):
+            return TorchDistribution(
+                action_space=self.action_space,
+                logits=logits,
+                squash_output=self.squash_output,
+            )
+        # Bernoulli distribution for MultiBinary action spaces
+        if isinstance(self.action_space, spaces.MultiBinary):
+            return TorchDistribution(
+                action_space=self.action_space,
+                logits=logits,
+                squash_output=self.squash_output,
             )
         msg = f"Action space {self.action_space} not supported."
-        raise NotImplementedError(
-            msg,
-        )
+        raise NotImplementedError(msg)
     def log_prob(self, action: torch.Tensor) -> torch.Tensor:
         """Get the log probability of the action.
@@ -322,7 +220,7 @@ class EvolvableDistribution(EvolvableWrapper):
             msg = "Distribution not initialized. Call forward first."
             raise ValueError(msg)
-        # The new TorchDistribution handles squashing correction internally for Box space
+        # Handles squashing correction internally for Box space
         return self.dist.log_prob(action)
     def entropy(self) -> torch.Tensor:
@@ -335,7 +233,7 @@ class EvolvableDistribution(EvolvableWrapper):
             msg = "Distribution not initialized. Call forward first."
             raise ValueError(msg)
-        # The new TorchDistribution returns analytical entropy for supported spaces
+        # Returns analytical entropy for supported spaces
         return self.dist.entropy()
     def apply_mask(self, logits: torch.Tensor, mask: ArrayOrTensor) -> torch.Tensor:
@@ -350,7 +248,7 @@ class EvolvableDistribution(EvolvableWrapper):
         """
         # Convert mask to tensor and reshape to match logits shape
         mask = torch.as_tensor(mask, dtype=torch.bool, device=self.device).view(
-            logits.shape,
+            logits.shape
         )
         if isinstance(self.action_space, spaces.Discrete):
@@ -360,7 +258,7 @@ class EvolvableDistribution(EvolvableWrapper):
                 list(self.action_space.nvec)
                 if isinstance(self.action_space, spaces.MultiDiscrete)
                 else [
-                    self.action_space.n,
+                    self.action_space.n
                 ]  # For MultiBinary, nvec is not present, use n
             )
             # Split mask and logits into separate distributions
@@ -370,12 +268,10 @@ class EvolvableDistribution(EvolvableWrapper):
             # Apply mask to each split
             masked_logits = []
             for split_logits_i, split_mask_i in zip(
-                split_logits,
-                split_masks,
-                strict=False,
-            ):  # Renamed for clarity
+                split_logits, split_masks, strict=False
+            ):
                 masked_logits.append(
-                    apply_action_mask_discrete(split_logits_i, split_mask_i),
+                    apply_action_mask_discrete(split_logits_i, split_mask_i)
                 )
             masked_logits = torch.cat(masked_logits, dim=1)
@@ -383,9 +279,7 @@ class EvolvableDistribution(EvolvableWrapper):
             # This should ideally not be reached if get_distribution handles the space,
             # but keeping for safety.
             msg = f"Action space {self.action_space} not supported for masking."
-            raise NotImplementedError(
-                msg,
-            )
+            raise NotImplementedError(msg)
         return masked_logits
@@ -394,20 +288,19 @@ class EvolvableDistribution(EvolvableWrapper):
         latent: torch.Tensor,
         action_mask: ArrayOrTensor | None = None,
         sample: bool = True,
-    ) -> (
-        tuple[torch.Tensor, torch.Tensor, torch.Tensor]
-        | tuple[None, None, torch.Tensor]
-    ):
+    ) -> Union[
+        tuple[torch.Tensor, torch.Tensor, torch.Tensor], tuple[None, None, torch.Tensor]
+    ]:
         """Forward pass of the network.
         :param latent: Latent space representation.
         :type latent: torch.Tensor
         :param action_mask: Mask to apply to the logits. Defaults to None.
-        :type action_mask: ArrayOrTensor | None
+        :type action_mask: Optional[ArrayOrTensor]
         :param sample: Whether to sample an action or return the mode/mean. Defaults to True.
         :type sample: bool
         :return: Action and log probability of the action.
-        :rtype: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | tuple[None, torch.Tensor, torch.Tensor]
+        :rtype: Union[tuple[torch.Tensor, torch.Tensor, torch.Tensor], tuple[None, torch.Tensor, torch.Tensor]]
         """
         logits = self.wrapped(latent)
@@ -415,7 +308,8 @@ class EvolvableDistribution(EvolvableWrapper):
             if isinstance(action_mask, (np.ndarray, list)):
                 # Attempt to stack if it's a list of arrays or object array, typical for vectorized envs
                 if isinstance(action_mask, list) or (
-                    isinstance(action_mask, np.ndarray) and action_mask.dtype == object
+                    isinstance(action_mask, np.ndarray)
+                    and action_mask.dtype == np.object_
                 ):
                     try:
                         action_mask = np.stack(action_mask)
@@ -428,15 +322,12 @@ class EvolvableDistribution(EvolvableWrapper):
             # Ensure action_mask is a tensor before applying.
             # The view in apply_mask expects a compatible shape or will error.
             action_mask = torch.as_tensor(
-                action_mask,
-                device=self.device,
-                dtype=torch.bool,
+                action_mask, device=self.device, dtype=torch.bool
             )
             logits = self.apply_mask(logits, action_mask)
         # Distribution from logits
-        # get_distribution now creates the new TorchDistribution object
         self.dist = self.get_distribution(logits)
         # Sample action, compute log probability and entropy
@@ -444,16 +335,14 @@ class EvolvableDistribution(EvolvableWrapper):
             action = self.dist.sample()
             log_prob = self.dist.log_prob(action)
         else:
-            action = None  # Mode/mean might be more appropriate if not sampling
-            log_prob = (
-                None  # Log prob of mode/mean typically not used in PPO sample step
-            )
+            action = None
+            log_prob = None
         entropy = self.dist.entropy()
         return action, log_prob, entropy
     def clone(self) -> "EvolvableDistribution":
-        """Clone the distribution.
+        """Clones the distribution.
         :return: Cloned distribution.
         :rtype: EvolvableDistribution

agilerl 2.5.0.dev0__tar.gz → 2.5.0.dev2__tar.gz

agilerl 2.5.0.dev0__tar.gz → 2.5.0.dev2__tar.gz