PyPI - torchrl-nightly - Versions diffs - 2025.7.15__cp313-cp313-macosx_10_13_universal2.whl → 2025.7.18__cp313-cp313-macosx_10_13_universal2.whl - Mend

torchrl-nightly 2025.7.15__cp313-cp313-macosx_10_13_universal2.whl → 2025.7.18__cp313-cp313-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

torchrl/_torchrl.cpython-313-darwin.so +0 -0
torchrl/collectors/collectors.py +13 -3
torchrl/data/llm/history.py +36 -0
torchrl/data/tensor_specs.py +34 -9
torchrl/envs/transforms/transforms.py +0 -1
torchrl/modules/distributions/discrete.py +1 -1
torchrl/modules/llm/policies/common.py +59 -9
torchrl/modules/llm/policies/transformers_wrapper.py +90 -53
torchrl/modules/llm/policies/vllm_wrapper.py +50 -23
torchrl/objectives/a2c.py +32 -13
torchrl/objectives/ppo.py +50 -32
torchrl/trainers/helpers/losses.py +2 -2
torchrl/version.py +2 -2
{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/METADATA +1 -1
{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/RECORD +18 -18
{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/WHEEL +0 -0
{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/licenses/LICENSE +0 -0
{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/top_level.txt +0 -0

torchrl/_torchrl.cpython-313-darwin.so CHANGED Viewed

Binary file

torchrl/collectors/collectors.py CHANGED Viewed

@@ -686,6 +686,10 @@ class SyncDataCollector(DataCollectorBase):
                 policy = RandomPolicy(env.full_action_spec)
         elif policy_factory is not None:
             raise TypeError("policy_factory cannot be used with policy argument.")
+        # If the underlying policy has a state_dict, we keep a reference to the policy and
+        # do all policy weight saving/loading through it
+        if hasattr(policy, "state_dict"):
+            self._policy_w_state_dict = policy
         if trust_policy is None:
             trust_policy = isinstance(policy, (RandomPolicy, CudaGraphModule))
@@ -1686,8 +1690,8 @@ class SyncDataCollector(DataCollectorBase):
         else:
             env_state_dict = OrderedDict()
-        if hasattr(self.policy, "state_dict"):
-            policy_state_dict = self.policy.state_dict()
+        if hasattr(self, "_policy_w_state_dict"):
+            policy_state_dict = self._policy_w_state_dict.state_dict()
             state_dict = OrderedDict(
                 policy_state_dict=policy_state_dict,
                 env_state_dict=env_state_dict,
@@ -1711,7 +1715,13 @@ class SyncDataCollector(DataCollectorBase):
         if strict or "env_state_dict" in state_dict:
             self.env.load_state_dict(state_dict["env_state_dict"], **kwargs)
         if strict or "policy_state_dict" in state_dict:
-            self.policy.load_state_dict(state_dict["policy_state_dict"], **kwargs)
+            if not hasattr(self, "_policy_w_state_dict"):
+                raise ValueError(
+                    "Underlying policy does not have state_dict to load policy_state_dict into."
+                )
+            self._policy_w_state_dict.load_state_dict(
+                state_dict["policy_state_dict"], **kwargs
+            )
         self._frames = state_dict["frames"]
         self._iter = state_dict["iter"]

torchrl/data/llm/history.py CHANGED Viewed

@@ -713,6 +713,42 @@ class History(TensorClass["nocast"]):
         | transformers.AutoProcessor  # noqa: F821
         | None = None,
     ) -> History:
+        r"""Inverts a chat template into a History object.
+        Args:
+            text (str | list[str]): The chat template to invert.
+            chat_template_name (str, optional): The name of the chat template to use.
+            tokenizer (transformers.AutoTokenizer | transformers.AutoProcessor, optional): The tokenizer to use.
+        Returns:
+            History: The inverted History object.
+        Examples:
+            >>> from torchrl.data.llm.history import History
+            >>> from transformers import AutoTokenizer
+            >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
+            >>> text = "<|im_start|>system\nYou are a helpful assistant.\n<|im_end|>\n<|im_start|>user\nWrite a python script that gives the capital of France or Germany.\n<|im_end|>\n<|im_start|>assistant\n<think>The capital of France is Paris, the capital of Germany is Berlin.</think>\n<answer><python>\n"
+            >>> history = History.from_text(text, tokenizer=tokenizer)
+            >>> print(history)
+            History(
+                content=NonTensorStack(
+                    ['You are a helpful assistant.', 'Write a python s...,
+                    batch_size=torch.Size([3]),
+                    device=None),
+                is_complete=NonTensorStack(
+                    [True, True, False],
+                    batch_size=torch.Size([3]),
+                    device=None),
+                role=NonTensorStack(
+                    ['system', 'user', 'assistant'],
+                    batch_size=torch.Size([3]),
+                    device=None),
+                tool_calls=None,
+                tool_responses=None,
+                batch_size=torch.Size([3]),
+                device=None,
+                is_shared=False)
+        """
         if chat_template_name is None:
             if chat_template is not None:
                 # TODO: find best match given template

torchrl/data/tensor_specs.py CHANGED Viewed

@@ -4449,12 +4449,18 @@ class Binary(Categorical):
                 f"shape of the {self.__class__.__name__} spec in expand()."
             )
         return self.__class__(
-            n=self.shape[-1], shape=shape, device=self.device, dtype=self.dtype
+            n=self.shape[-1] if len(self.shape) > 0 else None,
+            shape=shape,
+            device=self.device,
+            dtype=self.dtype,
         )
     def _reshape(self, shape):
         return self.__class__(
-            n=self.shape[-1], shape=shape, device=self.device, dtype=self.dtype
+            n=self.shape[-1] if len(self.shape) > 0 else None,
+            shape=shape,
+            device=self.device,
+            dtype=self.dtype,
         )
     def _unflatten(self, dim, sizes):
@@ -4464,7 +4470,10 @@ class Binary(Categorical):
             .shape
         )
         return self.__class__(
-            n=self.shape[-1], shape=shape, device=self.device, dtype=self.dtype
+            n=self.shape[-1] if len(self.shape) > 0 else None,
+            shape=shape,
+            device=self.device,
+            dtype=self.dtype,
         )
     def squeeze(self, dim=None):
@@ -4472,13 +4481,19 @@ class Binary(Categorical):
         if shape is None:
             return self
         return self.__class__(
-            n=self.shape[-1], shape=shape, device=self.device, dtype=self.dtype
+            n=self.shape[-1] if len(self.shape) > 0 else None,
+            shape=shape,
+            device=self.device,
+            dtype=self.dtype,
         )
     def unsqueeze(self, dim: int):
         shape = _unsqueezed_shape(self.shape, dim)
         return self.__class__(
-            n=self.shape[-1], shape=shape, device=self.device, dtype=self.dtype
+            n=self.shape[-1] if len(self.shape) > 0 else None,
+            shape=shape,
+            device=self.device,
+            dtype=self.dtype,
         )
     def unbind(self, dim: int = 0):
@@ -4495,7 +4510,10 @@ class Binary(Categorical):
         shape = tuple(s for i, s in enumerate(self.shape) if i != dim)
         return tuple(
             self.__class__(
-                n=self.shape[-1], shape=shape, device=self.device, dtype=self.dtype
+                n=self.shape[-1] if len(self.shape) > 0 else None,
+                shape=shape,
+                device=self.device,
+                dtype=self.dtype,
             )
             for i in range(self.shape[dim])
         )
@@ -4512,12 +4530,15 @@ class Binary(Categorical):
         if dest_device == self.device and dest_dtype == self.dtype:
             return self
         return self.__class__(
-            n=self.shape[-1], shape=self.shape, device=dest_device, dtype=dest_dtype
+            n=self.shape[-1] if len(self.shape) > 0 else None,
+            shape=self.shape,
+            device=dest_device,
+            dtype=dest_dtype,
         )
     def clone(self) -> Binary:
         return self.__class__(
-            n=self.shape[-1],
+            n=self.shape[-1] if len(self.shape) > 0 else None,
             shape=self.shape,
             device=self.device,
             dtype=self.dtype,
@@ -4528,6 +4549,8 @@ class Binary(Categorical):
         The last dimension of the spec (length n of the binary vector) cannot be indexed.
         """
+        if not len(self.shape):
+            raise ValueError("Cannot index a Binary spec with an empty shape")
         indexed_shape = _shape_indexing(self.shape[:-1], idx)
         return self.__class__(
             n=self.shape[-1],
@@ -5533,8 +5556,10 @@ class Composite(TensorSpec):
         sub_str = [
             indent(f"{k}: {str(item)}", 4 * " ") for k, item in self._specs.items()
         ]
+        if len(sub_str) == 0:
+            return f"{self.__class__.__name__}(device={self._device}, shape={self.shape}, data_cls={self.data_cls})"
         sub_str = ",\n".join(sub_str)
-        return f"Composite(\n{sub_str},\n    device={self._device},\n    shape={self.shape})"
+        return f"{self.__class__.__name__}(\n{sub_str},\n    device={self._device},\n    shape={self.shape},\n    data_cls={self.data_cls})"
     def type_check(
         self,

torchrl/envs/transforms/transforms.py CHANGED Viewed

@@ -1211,7 +1211,6 @@ but got an object of type {type(transform)}."""
         if tensordict is not None:
             # We must avoid modifying the original tensordict so a shallow copy is necessary.
             # We just select the input data and reset signal, which is all we need.
-            self.transform.transform_input_spec(self.base_env.input_spec.unlock_())
             tensordict = tensordict.select(
                 *self.reset_keys, *self.state_spec.keys(True, True), strict=False
             )

torchrl/modules/distributions/discrete.py CHANGED Viewed

@@ -352,7 +352,7 @@ class MaskedCategorical(D.Categorical):
                 logits = self.logits
                 if logits.ndim > 2:
                     # Bring channels in 2nd dim
-                    logits = logits.transpose(-1, 1)
+                    logits = logits.permute(0, -1, *range(1, logits.ndim - 1))
                 original_value_shape = None
                 if logits.ndim == 1 and value.ndim >= 1:
                     if value.ndim >= 2:

torchrl/modules/llm/policies/common.py CHANGED Viewed

@@ -4,12 +4,13 @@
 # LICENSE file in the root directory of this source tree.
 from __future__ import annotations
+import warnings
 import weakref
 from typing import Any, Literal, overload
 import torch
-from tensordict import NestedKey, TensorDictBase
-from tensordict.nn import TensorDictModuleBase, TensorDictSequential
+from tensordict import lazy_stack, NestedKey, TensorDictBase
+from tensordict.nn import TensorDictModuleBase
 from tensordict.tensorclass import TensorClass
 from tensordict.utils import _zip_strict
 from torch import distributions as D
@@ -171,6 +172,39 @@ class ChatHistory(TensorClass["nocast"]):
             step_mdp_static=True,
         )
+    def __post_init__(self):
+        # Check that all history objects have one more batch dimension than the ChatHistory object
+        if self.prompt is not None:
+            if getattr(self.prompt, "batch_dims", None) == self.batch_dims:
+                warnings.warn(
+                    "Prompt history should have one more batch dimension than the ChatHistory object to handle multi-turn conversations, "
+                    f"got {self.prompt.batch_dims} and {self.batch_dims}. "
+                    "The batch dimension of the ChatHistory object will be unsqueezed along the last dimension."
+                )
+                self.prompt = lazy_stack(
+                    [self.prompt], -1
+                )  # equivalent to unsqueeze(-1) but make sure it's a lazy stack
+        if self.response is not None:
+            if getattr(self.response, "batch_dims", None) == self.batch_dims:
+                warnings.warn(
+                    "Response history should have one more batch dimension than the ChatHistory object to handle multi-turn conversations, "
+                    f"got {self.response.batch_dims} and {self.batch_dims}. "
+                    "The batch dimension of the ChatHistory object will be unsqueezed along the last dimension."
+                )
+                self.response = lazy_stack(
+                    [self.response], -1
+                )  # equivalent to unsqueeze(-1) but make sure it's a lazy stack
+        if self.full is not None:
+            if getattr(self.full, "batch_dims", None) == self.batch_dims:
+                warnings.warn(
+                    "Full history should have one more batch dimension than the ChatHistory object to handle multi-turn conversations, "
+                    f"got {self.full.batch_dims} and {self.batch_dims}. "
+                    "The batch dimension of the ChatHistory object will be unsqueezed along the last dimension."
+                )
+                self.full = lazy_stack(
+                    [self.full], -1
+                )  # equivalent to unsqueeze(-1) but make sure it's a lazy stack
 class LogProbs(TensorClass["nocast"]):
     """A log-probability container.
@@ -454,7 +488,7 @@ class LLMWrapperBase(TensorDictModuleBase):
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
         # Get logits/log-probs
         if as_padded_tensor is None:
@@ -529,7 +563,7 @@ class LLMWrapperBase(TensorDictModuleBase):
                 "get_dist_with_prompt_mask is not implemented for generate=True. "
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
         # Try to get prompt tokens first
         if self.pad_output:
@@ -640,7 +674,7 @@ class LLMWrapperBase(TensorDictModuleBase):
                 "get_dist_with_assistant_mask is not implemented for generate=True. "
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
         # Update the tokens key to reflect the tokenized history when querying the log-probs
         tensordict.update(
             td_out,
@@ -709,7 +743,7 @@ class LLMWrapperBase(TensorDictModuleBase):
                 "get_dist_with_attention_mask is not implemented for generate=True. "
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
         if self.pad_output:
             logits = td_out.get(logits_key)
             attention_mask = td_out.get(attention_mask_key)
@@ -766,7 +800,7 @@ class LLMWrapperBase(TensorDictModuleBase):
                 "get_dist_with_custom_mask is not implemented for generate=True. "
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
         if self.pad_output:
             logits = td_out.get(logits_key)
         else:
@@ -813,8 +847,24 @@ class LLMWrapperBase(TensorDictModuleBase):
         """
         return self._get_dist_with_attention_mask(tensordict, **kwargs)
-    # Sampling is taken care of by the sub-modules
-    forward = TensorDictSequential.forward
+    def forward(
+        self,
+        tensordict: TensorDictBase,
+        *,
+        tensordict_out: TensorDictBase | None = None,
+        logits_only: bool = False,
+        **kwargs,
+    ) -> TensorDictBase:  # noqa: D417
+        """Forward pass for the LLM policy.
+        Args:
+            tensordict (TensorDictBase): The input tensordict.
+        Keyword Args:
+            tensordict_out (TensorDictBase | None): The output tensordict.
+            logits_only (bool): Whether to return only the logits. Only effective if generate=False. Defaults to `False`.
+        """
+        raise NotImplementedError
     def _check_padded(self, val: torch.Tensor) -> torch.Tensor:
         """Check that a value is a padded tensor."""

torchrl/modules/llm/policies/transformers_wrapper.py CHANGED Viewed

@@ -13,6 +13,7 @@ from typing import Literal
 import torch
 from tensordict import (
     lazy_stack,
+    LazyStackedTensorDict,
     MetaData,
     NonTensorStack,
     set_list_to_stack,
@@ -468,19 +469,32 @@ class TransformersWrapper(LLMWrapperBase):
     def forward(
         self,
         tensordict: TensorDictBase,
+        *,
         tensordict_out: TensorDictBase | None = None,
+        logits_only: bool = False,
         **kwargs,
     ) -> TensorDictBase:
+        tensordict_orig = tensordict
         if not tensordict.ndim:
+            if tensordict_out is not None:
+                raise ValueError(
+                    "tensordict_out must not be provided when tensordict.ndim == 0. If this is needed, "
+                    "please submit an issue on github."
+                )
             # unsqueeze - squeeze the input
-            try:
-                return self(lazy_stack([tensordict])).squeeze(0)
-            except Exception as e:
-                raise RuntimeError(
-                    f"Unsqueeze/squeeze failed. Inputs to {type(self).__name__} should ideally be 1 dimensional."
-                ) from e
+            return self.forward(lazy_stack([tensordict]), logits_only=logits_only)[0]
         elif tensordict.ndim > 1:
-            return self(tensordict.reshape(-1)).view(tensordict.shape)
+            if tensordict_out is not None:
+                raise ValueError(
+                    "tensordict_out must not be provided when tensordict.ndim > 1. If this is needed, "
+                    "please submit an issue on github."
+                )
+            return self.forward(tensordict.reshape(-1), logits_only=logits_only).view(
+                tensordict.shape
+            )
+        if not isinstance(tensordict, LazyStackedTensorDict):
+            tensordict = tensordict.to_lazystack(0)
         _source_device = None
         if self._device:
@@ -517,17 +531,23 @@ class TransformersWrapper(LLMWrapperBase):
             if self.generate:
                 out = self._from_transformers_generate_history(tensordict, cfg, out)
             else:
-                out = self._from_transformers_logprobs_history(tensordict, cfg, out)
+                out = self._from_transformers_logprobs_history(
+                    tensordict, cfg, out, logits_only=logits_only
+                )
         elif self.input_mode == "text":
             if self.generate:
                 out = self._from_transformers_generate_text(tensordict, cfg, out)
             else:
-                out = self._from_transformers_logprobs_text(tensordict, cfg, out)
+                out = self._from_transformers_logprobs_text(
+                    tensordict, cfg, out, logits_only=logits_only
+                )
         elif self.input_mode == "tokens":
             if self.generate:
                 out = self._from_transformers_generate_tokens(tensordict, cfg, out)
             else:
-                out = self._from_transformers_logprobs_tokens(tensordict, cfg, out)
+                out = self._from_transformers_logprobs_tokens(
+                    tensordict, cfg, out, logits_only=logits_only
+                )
         if _source_device:
             out = out.to(_source_device)
@@ -535,7 +555,7 @@ class TransformersWrapper(LLMWrapperBase):
         if tensordict_out is None:
             if self.inplace is True:
                 # The output is the input
-                tensordict_out = tensordict
+                tensordict_out = tensordict_orig
             elif self.inplace is False:
                 # The output is the new structure
                 tensordict_out = out
@@ -690,7 +710,7 @@ class TransformersWrapper(LLMWrapperBase):
         result.set(self.history_key, history_chat)
         return result
-    def _from_transformers_logprobs_history(self, td, cfg, out):
+    def _from_transformers_logprobs_history(self, td, cfg, out, logits_only=False):
         """Compute log-probs from history input."""
         from torchrl.data.llm import History
@@ -731,7 +751,9 @@ class TransformersWrapper(LLMWrapperBase):
             raise ValueError(
                 f"Expected TensorDictBase for history input, got {type(response_tokens)}"
             )
-        result = self._logprobs_from_history_tokens(response_tokens, cfg, out)
+        result = self._logprobs_from_history_tokens(
+            response_tokens, cfg, out, logits_only=logits_only
+        )
         text_result = Text._from_tensordict(result.empty())
         result.set(self.text_key, text_result)
         result[self.text_key, "full"] = text_full
@@ -952,7 +974,9 @@ class TransformersWrapper(LLMWrapperBase):
                 result = result.to(cast)
             return result
-    def _logprobs_from_history_tokens(self, response_tokens, cfg, out):
+    def _logprobs_from_history_tokens(
+        self, response_tokens, cfg, out, logits_only=False
+    ):
         """Compute log-probs from history tokens."""
         pad_val = self.tokenizer.pad_token_id
@@ -996,6 +1020,7 @@ class TransformersWrapper(LLMWrapperBase):
             tokens_full_padded,
             attention_mask_full_padded,
             pad_val,
+            logits_only=logits_only,
         )
         # Build output TensorClass objects
@@ -1051,19 +1076,20 @@ class TransformersWrapper(LLMWrapperBase):
         tokens_obj.padded = MetaData(self.pad_output)
         out.set(self.tokens_key, tokens_obj)
-        log_probs_obj = LogProbs._from_tensordict(
-            TensorDict(batch_size=out.batch_size).to_lazystack(0)
-        )
-        if self.pad_output:
-            log_probs_obj.full = log_probs_full_padded
-        else:
-            log_probs_full_unpadded = _unpad_tensors(
-                log_probs_full_padded, attention_mask_full_padded, as_nested=False
+        if not logits_only:
+            log_probs_obj = LogProbs._from_tensordict(
+                TensorDict(batch_size=out.batch_size).to_lazystack(0)
             )
-            log_probs_obj.full = log_probs_full_unpadded
-        log_probs_obj.response = None
-        log_probs_obj.padded = MetaData(self.pad_output)
-        out.set(self.log_probs_key, log_probs_obj)
+            if self.pad_output:
+                log_probs_obj.full = log_probs_full_padded
+            else:
+                log_probs_full_unpadded = _unpad_tensors(
+                    log_probs_full_padded, attention_mask_full_padded, as_nested=False
+                )
+                log_probs_obj.full = log_probs_full_unpadded
+            log_probs_obj.response = None
+            log_probs_obj.padded = MetaData(self.pad_output)
+            out.set(self.log_probs_key, log_probs_obj)
         # Add logits to output if we're in a get_dist call
         if self._in_get_dist_call:
@@ -1095,7 +1121,7 @@ class TransformersWrapper(LLMWrapperBase):
             raise ValueError(f"Expected list of text for text input, got {type(text)}")
         return self._generate_from_text(text, cfg, out)
-    def _from_transformers_logprobs_text(self, td, cfg, out):
+    def _from_transformers_logprobs_text(self, td, cfg, out, logits_only=False):
         """Compute log-probs from text input."""
         # Validate input
         if self.input_key not in td:
@@ -1168,6 +1194,7 @@ class TransformersWrapper(LLMWrapperBase):
             input_ids_full_padded,
             attention_mask_full_padded,
             self.tokenizer.pad_token_id,
+            logits_only=logits_only,
         )
         # Build output TensorClass objects
@@ -1212,19 +1239,20 @@ class TransformersWrapper(LLMWrapperBase):
         masks_obj.padded = MetaData(self.pad_output)
         out.set(self.masks_key, masks_obj)
-        log_probs_obj = LogProbs._from_tensordict(
-            TensorDict(batch_size=out.batch_size).to_lazystack(0)
-        )
-        if self.pad_output:
-            log_probs_obj.full = log_probs_full_padded
-        else:
-            log_probs_full_unpadded = _unpad_tensors(
-                log_probs_full_padded, attention_mask_full_padded, as_nested=False
+        if not logits_only:
+            log_probs_obj = LogProbs._from_tensordict(
+                TensorDict(batch_size=out.batch_size).to_lazystack(0)
             )
-            log_probs_obj.full = log_probs_full_unpadded
-        log_probs_obj.response = None
-        log_probs_obj.padded = MetaData(self.pad_output)
-        out.set(self.log_probs_key, log_probs_obj)
+            if self.pad_output:
+                log_probs_obj.full = log_probs_full_padded
+            else:
+                log_probs_full_unpadded = _unpad_tensors(
+                    log_probs_full_padded, attention_mask_full_padded, as_nested=False
+                )
+                log_probs_obj.full = log_probs_full_unpadded
+            log_probs_obj.response = None
+            log_probs_obj.padded = MetaData(self.pad_output)
+            out.set(self.log_probs_key, log_probs_obj)
         # Add logits to output if we're in a get_dist call
         if self._in_get_dist_call:
@@ -1416,7 +1444,11 @@ class TransformersWrapper(LLMWrapperBase):
         return out
     def _from_transformers_logprobs_tokens(
-        self, td: TensorDictBase, cfg: dict | None, out: TensorDictBase
+        self,
+        td: TensorDictBase,
+        cfg: dict | None,
+        out: TensorDictBase,
+        logits_only=False,
     ) -> TensorDictBase:
         """Compute log-probs from tokens input."""
         # Validate input
@@ -1470,6 +1502,7 @@ class TransformersWrapper(LLMWrapperBase):
             input_ids_full_padded,
             attention_mask_full_padded,
             self.tokenizer.pad_token_id,
+            logits_only=logits_only,
         )
         # Build output TensorClass objects
@@ -1514,19 +1547,20 @@ class TransformersWrapper(LLMWrapperBase):
         masks_obj.padded = MetaData(self.pad_output)
         out.set(self.masks_key, masks_obj)
-        log_probs_obj = LogProbs._from_tensordict(
-            TensorDict(batch_size=out.batch_size).to_lazystack(0)
-        )
-        if self.pad_output:
-            log_probs_obj.full = log_probs_full_padded
-        else:
-            log_probs_full_unpadded = _unpad_tensors(
-                log_probs_full_padded, attention_mask_full_padded, as_nested=False
+        if not logits_only:
+            log_probs_obj = LogProbs._from_tensordict(
+                TensorDict(batch_size=out.batch_size).to_lazystack(0)
             )
-            log_probs_obj.full = log_probs_full_unpadded
-        log_probs_obj.response = None
-        log_probs_obj.padded = MetaData(self.pad_output)
-        out.set(self.log_probs_key, log_probs_obj)
+            if self.pad_output:
+                log_probs_obj.full = log_probs_full_padded
+            else:
+                log_probs_full_unpadded = _unpad_tensors(
+                    log_probs_full_padded, attention_mask_full_padded, as_nested=False
+                )
+                log_probs_obj.full = log_probs_full_unpadded
+            log_probs_obj.response = None
+            log_probs_obj.padded = MetaData(self.pad_output)
+            out.set(self.log_probs_key, log_probs_obj)
         # Add logits to output if we're in a get_dist call
         if self._in_get_dist_call:
@@ -1567,7 +1601,7 @@ class TransformersWrapper(LLMWrapperBase):
         return log_probs, logits
     def _compute_log_probs_from_model_output(
-        self, model_output, input_ids, attention_mask, pad_val
+        self, model_output, input_ids, attention_mask, pad_val, logits_only=False
     ):
         """Compute log-probs from model output without modifying original tensors.
@@ -1576,6 +1610,7 @@ class TransformersWrapper(LLMWrapperBase):
             input_ids: Original input token ids
             attention_mask: Original attention mask
             pad_val: Padding token value to ignore in loss computation
+            logits_only: Whether to return only the logits.
         Returns:
             tuple: (log_probs, shifted_logits) where log_probs are the computed log probabilities
@@ -1600,6 +1635,8 @@ class TransformersWrapper(LLMWrapperBase):
             raise ValueError(
                 f"The logits shape {shifted_logits.shape} does not match the input ids shape {shifted_input_ids.shape}"
             )
+        if logits_only:
+            return None, shifted_logits
         # Compute log-probs
         td = TensorDict(

torchrl/modules/llm/policies/vllm_wrapper.py CHANGED Viewed

@@ -11,6 +11,7 @@ from typing import Any, Literal
 import torch
 from tensordict import (
     lazy_stack,
+    LazyStackedTensorDict,
     MetaData,
     NonTensorStack,
     set_list_to_stack,
@@ -500,19 +501,32 @@ class vLLMWrapper(LLMWrapperBase):
     def forward(
         self,
         tensordict: TensorDictBase,
+        *,
         tensordict_out: TensorDictBase | None = None,
+        logits_only: bool = False,
         **kwargs,
     ) -> TensorDictBase:
+        tensordict_orig = tensordict
         if not tensordict.ndim:
+            if tensordict_out is not None:
+                raise ValueError(
+                    "tensordict_out must not be provided when tensordict.ndim == 0. If this is needed, "
+                    "please submit an issue on github."
+                )
             # unsqueeze - squeeze the input
-            try:
-                return self(lazy_stack([tensordict])).squeeze(0)
-            except Exception as e:
-                raise RuntimeError(
-                    f"Unsqueeze/squeeze failed. Inputs to {type(self).__name__} should ideally be 1 dimensional."
-                ) from e
+            return self.forward(lazy_stack([tensordict]), logits_only=logits_only)[0]
         elif tensordict.ndim > 1:
-            return self(tensordict.reshape(-1)).view(tensordict.shape)
+            if tensordict_out is not None:
+                raise ValueError(
+                    "tensordict_out must not be provided when tensordict.ndim > 1. If this is needed, "
+                    "please submit an issue on github."
+                )
+            return self.forward(tensordict.reshape(-1), logits_only=logits_only).view(
+                tensordict.shape
+            )
+        if not isinstance(tensordict, LazyStackedTensorDict):
+            tensordict = tensordict.to_lazystack(0)
         _source_device = None
         if self._device:
@@ -567,7 +581,7 @@ class vLLMWrapper(LLMWrapperBase):
         if tensordict_out is None:
             if self.inplace is True:
                 # The output is the input
-                tensordict_out = tensordict
+                tensordict_out = tensordict_orig
             elif self.inplace is False:
                 # The output is the new structure
                 tensordict_out = out
@@ -1242,12 +1256,14 @@ class vLLMWrapper(LLMWrapperBase):
         generate_kwargs = {"sampling_params": sampling_params}
         args = ()
+        empirical_attention_mask = None
         if tokens_prompt_unpadded is None:
             # TODO: To be on the safe side, we may do this even in the unpadded case since we're not sure
             #  the user passed an unpadded tensor in the first place.
+            empirical_attention_mask = tokens_prompt_padded != self.padding_value
             tokens_prompt_list = self._to_list(
-                tokens_prompt_padded, tokens_prompt_padded != self.padding_value
+                tokens_prompt_padded, empirical_attention_mask
             )
         else:
             tokens_prompt_list = self._to_list(tokens_prompt_unpadded, None)
@@ -1365,6 +1381,22 @@ class vLLMWrapper(LLMWrapperBase):
                         padding_value=self.padding_value,
                         padding_side="right",
                     )
+                    if (
+                        prompt_logprobs_padded.shape[-1]
+                        != tokens_prompt_padded.shape[-1]
+                    ):
+                        tshape = tokens_prompt_padded.shape
+                        oshape = prompt_logprobs_padded.shape
+                        # it could be that the input was padded already - padding again then
+                        prompt_logprobs_padded = torch.cat(
+                            [
+                                prompt_logprobs_padded.new_zeros(
+                                    tshape[:-1] + (tshape[-1] - oshape[-1],)
+                                ),
+                                prompt_logprobs_padded,
+                            ],
+                            -1,
+                        )
                 else:
                     prompt_logprobs_list = request_output_tc.get(
                         "prompt_logprobs",
@@ -1490,26 +1522,21 @@ class vLLMWrapper(LLMWrapperBase):
         request_output_tc = _RequestOutput_tc.from_request_output(tokens_out_stuct)
+        # For unpadded case, extract from each sequence
+        log_probs_full_unpadded = request_output_tc.get("prompt_logprobs", as_list=True)
         # Extract log-probs from prompt_logprobs
         if self.pad_output:
             # For padded case, use all prompt_logprobs
-            log_probs_full_padded = request_output_tc.get(
-                "prompt_logprobs",
-                as_padded_tensor=True,
-                padding_value=0,
-                padding_side="left",
+            if attention_mask_full_padded is not None:
+                attention_mask_full_padded = tokens_full_padded != self.padding_value
+            log_probs_full_padded = torch.zeros_like(
+                tokens_full_padded, dtype=torch.get_default_dtype()
             )
-            # Mask out padding
-            attention_mask_full_padded = tokens_full_padded != self.padding_value
-            log_probs_full_padded = torch.where(
-                attention_mask_full_padded, log_probs_full_padded, 0.0
+            log_probs_full_padded[attention_mask_full_padded] = torch.cat(
+                log_probs_full_unpadded, -1
             )
         else:
-            # For unpadded case, extract from each sequence
-            log_probs_full_unpadded = request_output_tc.get(
-                "prompt_logprobs", as_list=True
-            )
             self._check_not_padded(log_probs_full_unpadded)
         assistant_mask_full_padded = None

torchrl/objectives/a2c.py CHANGED Viewed

@@ -70,7 +70,7 @@ class A2CLoss(LossModule):
             samples will be used to compute this estimate.
             Defaults to ``1``.
         entropy_coeff (:obj:`float`): the weight of the entropy loss. Defaults to `0.01``.
-        critic_coef (:obj:`float`): the weight of the critic loss. Defaults to ``1.0``. If ``None``, the critic
+        critic_coeff (:obj:`float`): the weight of the critic loss. Defaults to ``1.0``. If ``None``, the critic
             loss won't be included and the in-keys will miss the critic inputs.
         loss_critic_type (str): loss function for the value discrepancy.
             Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``.
@@ -156,7 +156,7 @@ class A2CLoss(LossModule):
     the expected keyword arguments are:
     ``["action", "next_reward", "next_done", "next_terminated"]`` + in_keys of the actor and critic.
     The return value is a tuple of tensors in the following order:
-    ``["loss_objective"]`` + ``["loss_critic"]`` if critic_coef is not None + ``["entropy", "loss_entropy"]`` if entropy_bonus is True and critic_coef is not None
+    ``["loss_objective"]`` + ``["loss_critic"]`` if critic_coeff is not None + ``["entropy", "loss_entropy"]`` if entropy_bonus is True and critic_coeff is not None
     Examples:
         >>> import torch
@@ -277,8 +277,8 @@ class A2CLoss(LossModule):
         *,
         entropy_bonus: bool = True,
         samples_mc_entropy: int = 1,
-        entropy_coeff: float = 0.01,
-        critic_coef: float = 1.0,
+        entropy_coeff: float | None = None,
+        critic_coeff: float = 1.0,
         loss_critic_type: str = "smooth_l1",
         gamma: float | None = None,
         separate_losses: bool = False,
@@ -291,13 +291,32 @@ class A2CLoss(LossModule):
         clip_value: float | None = None,
         **kwargs,
     ):
+        # Handle deprecated entropy_coef argument
         if "entropy_coef" in kwargs:
+            if entropy_coeff is not None:  # Check if entropy_coeff was explicitly set
+                raise ValueError(
+                    "Cannot specify both 'entropy_coef' and 'entropy_coeff'"
+                )
             warnings.warn(
                 "'entropy_coef' is deprecated and will be removed in torchrl v0.11. Please use 'entropy_coeff' instead.",
                 DeprecationWarning,
             )
             entropy_coeff = kwargs.pop("entropy_coef")
+        # Set default value if None
+        if entropy_coeff is None:
+            entropy_coeff = 0.01
+        # Handle deprecated critic_coef argument
+        if "critic_coef" in kwargs:
+            if critic_coeff != 1.0:  # Check if critic_coeff was explicitly set
+                raise ValueError("Cannot specify both 'critic_coef' and 'critic_coeff'")
+            warnings.warn(
+                "'critic_coef' is deprecated and will be removed in torchrl v0.11. Please use 'critic_coeff' instead.",
+                DeprecationWarning,
+            )
+            critic_coeff = kwargs.pop("critic_coef")
         if actor is not None:
             actor_network = actor
             del actor
@@ -349,12 +368,12 @@ class A2CLoss(LossModule):
         self.register_buffer(
             "entropy_coeff", torch.as_tensor(entropy_coeff, device=device)
         )
-        if critic_coef is not None:
+        if critic_coeff is not None:
             self.register_buffer(
-                "critic_coef", torch.as_tensor(critic_coef, device=device)
+                "critic_coeff", torch.as_tensor(critic_coeff, device=device)
             )
         else:
-            self.critic_coef = None
+            self.critic_coeff = None
         if gamma is not None:
             raise TypeError(_GAMMA_LMBDA_DEPREC_ERROR)
@@ -399,7 +418,7 @@ class A2CLoss(LossModule):
             *self.actor_network.in_keys,
             *[("next", key) for key in self.actor_network.in_keys],
         ]
-        if self.critic_coef is not None:
+        if self.critic_coeff is not None:
             keys.extend(self.critic_network.in_keys)
         return list(set(keys))
@@ -407,7 +426,7 @@ class A2CLoss(LossModule):
     def out_keys(self):
         if self._out_keys is None:
             outs = ["loss_objective"]
-            if self.critic_coef is not None:
+            if self.critic_coeff is not None:
                 outs.append("loss_critic")
             if self.entropy_bonus:
                 outs.append("entropy")
@@ -478,7 +497,7 @@ class A2CLoss(LossModule):
         return log_prob, dist
     def loss_critic(self, tensordict: TensorDictBase) -> tuple[torch.Tensor, float]:
-        """Returns the loss value of the critic, multiplied by ``critic_coef`` if it is not ``None``.
+        """Returns the loss value of the critic, multiplied by ``critic_coeff`` if it is not ``None``.
         Returns the loss and the clip-fraction.
@@ -539,8 +558,8 @@ class A2CLoss(LossModule):
             "target_actor_network_params",
             "target_critic_network_params",
         )
-        if self.critic_coef is not None:
-            return self.critic_coef * loss_value, clip_fraction
+        if self.critic_coeff is not None:
+            return self.critic_coeff * loss_value, clip_fraction
         return loss_value, clip_fraction
     @property
@@ -568,7 +587,7 @@ class A2CLoss(LossModule):
             entropy = self.get_entropy_bonus(dist)
             td_out.set("entropy", entropy.detach().mean())  # for logging
             td_out.set("loss_entropy", -self.entropy_coeff * entropy)
-        if self.critic_coef is not None:
+        if self.critic_coeff is not None:
             loss_critic, value_clip_fraction = self.loss_critic(tensordict)
             td_out.set("loss_critic", loss_critic)
             if value_clip_fraction is not None:

torchrl/objectives/ppo.py CHANGED Viewed

@@ -102,13 +102,13 @@ class PPOLoss(LossModule):
             Defaults to ``1``.
         entropy_coeff: scalar | Mapping[str, scalar], optional): entropy multiplier when computing the total loss.
             * **Scalar**: one value applied to the summed entropy of every action head.
-            * **Mapping** ``{head_name: coef}`` gives an individual coefficient for each action-head's entropy.
+            * **Mapping** ``{head_name: coeff}`` gives an individual coefficient for each action-head's entropy.
             Defaults to ``0.01``.
         log_explained_variance (bool, optional): if ``True``, the explained variance of the critic
             predictions w.r.t. value targets will be computed and logged as ``"explained_variance"``.
             This can help monitor critic quality during training. Best possible score is 1.0, lower values are worse. Defaults to ``True``.
-        critic_coef (scalar, optional): critic loss multiplier when computing the total
-            loss. Defaults to ``1.0``. Set ``critic_coef`` to ``None`` to exclude the value
+        critic_coeff (scalar, optional): critic loss multiplier when computing the total
+            loss. Defaults to ``1.0``. Set ``critic_coeff`` to ``None`` to exclude the value
             loss from the forward outputs.
         loss_critic_type (str, optional): loss function for the value discrepancy.
             Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``.
@@ -239,7 +239,7 @@ class PPOLoss(LossModule):
     the expected keyword arguments are:
     ``["action", "sample_log_prob", "next_reward", "next_done", "next_terminated"]`` + in_keys of the actor and value network.
     The return value is a tuple of tensors in the following order:
-    ``["loss_objective"]`` + ``["entropy", "loss_entropy"]`` if entropy_bonus is set + ``"loss_critic"`` if critic_coef is not ``None``.
+    ``["loss_objective"]`` + ``["entropy", "loss_entropy"]`` if entropy_bonus is set + ``"loss_critic"`` if critic_coeff is not ``None``.
     The output keys can also be filtered using :meth:`PPOLoss.select_out_keys` method.
     Examples:
@@ -351,9 +351,9 @@ class PPOLoss(LossModule):
         *,
         entropy_bonus: bool = True,
         samples_mc_entropy: int = 1,
-        entropy_coeff: float | Mapping[str, float] = 0.01,
+        entropy_coeff: float | Mapping[str, float] | None = None,
         log_explained_variance: bool = True,
-        critic_coef: float | None = None,
+        critic_coeff: float | None = None,
         loss_critic_type: str = "smooth_l1",
         normalize_advantage: bool = False,
         normalize_advantage_exclude_dims: tuple[int] = (),
@@ -377,13 +377,23 @@ class PPOLoss(LossModule):
             critic_network = critic
             del critic
-        if critic_coef is None and critic_network is not None:
-            critic_coef = 1.0
-        elif critic_coef in (None, 0) and critic_network is not None:
-            critic_coef = None
+        # Handle deprecated critic_coef argument
+        if "critic_coef" in kwargs:
+            if critic_coeff is not None:
+                raise ValueError("Cannot specify both 'critic_coef' and 'critic_coeff'")
+            warnings.warn(
+                "'critic_coef' is deprecated and will be removed in torchrl v0.11. Please use 'critic_coeff' instead.",
+                DeprecationWarning,
+            )
+            critic_coeff = kwargs.pop("critic_coef")
+        if critic_coeff is None and critic_network is not None:
+            critic_coeff = 1.0
+        elif critic_coeff in (None, 0) and critic_network is not None:
+            critic_coeff = None
         if actor_network is None or (
-            critic_network is None and critic_coef not in (None, 0.0)
+            critic_network is None and critic_coeff not in (None, 0.0)
         ):
             raise TypeError(
                 "Missing positional arguments actor_network or critic_network."
@@ -431,13 +441,21 @@ class PPOLoss(LossModule):
                     torch, "get_default_device", lambda: torch.device("cpu")
                 )()
-        # Handle deprecated entropy_coeff argument
-        if "entropy_coeff" in kwargs:
+        # Handle deprecated entropy_coef argument
+        if "entropy_coef" in kwargs:
+            if entropy_coeff is not None:  # Check if entropy_coeff was explicitly set
+                raise ValueError(
+                    "Cannot specify both 'entropy_coef' and 'entropy_coeff'"
+                )
             warnings.warn(
-                "'entropy_coeff' is deprecated and will be removed in torchrl v0.11. Please use 'entropy_coeff' instead.",
+                "'entropy_coef' is deprecated and will be removed in torchrl v0.11. Please use 'entropy_coeff' instead.",
                 DeprecationWarning,
             )
-            entropy_coeff = kwargs.pop("entropy_coeff")
+            entropy_coeff = kwargs.pop("entropy_coef")
+        # Set default value if None
+        if entropy_coeff is None:
+            entropy_coeff = 0.01
         if isinstance(entropy_coeff, Mapping):
             # Store the mapping for per-head coefficients
@@ -457,13 +475,13 @@ class PPOLoss(LossModule):
             self._entropy_coeff_map = None
         else:
             raise TypeError("entropy_coeff must be a float or a Mapping[str, float]")
-        if critic_coef is not None:
+        if critic_coeff is not None:
             self.register_buffer(
-                "critic_coef", torch.tensor(critic_coef, device=device)
+                "critic_coeff", torch.tensor(critic_coeff, device=device)
             )
         else:
-            self.critic_coef = None
-        self._has_critic = bool(self.critic_coef is not None and self.critic_coef > 0)
+            self.critic_coeff = None
+        self._has_critic = bool(self.critic_coeff is not None and self.critic_coeff > 0)
         self.loss_critic_type = loss_critic_type
         self.normalize_advantage = normalize_advantage
         self.normalize_advantage_exclude_dims = normalize_advantage_exclude_dims
@@ -692,7 +710,7 @@ class PPOLoss(LossModule):
     def loss_critic(
         self, tensordict: TensorDictBase
     ) -> tuple[torch.Tensor | TensorDict, ...]:
-        """Returns the critic loss multiplied by ``critic_coef``, if it is not ``None``."""
+        """Returns the critic loss multiplied by ``critic_coeff``, if it is not ``None``."""
         # TODO: if the advantage is gathered by forward, this introduces an
         # overhead that we could easily reduce.
         if self.separate_losses:
@@ -766,7 +784,7 @@ class PPOLoss(LossModule):
             "target_critic_network_params",
         )
         if self._has_critic:
-            return self.critic_coef * loss_value, clip_fraction, explained_variance
+            return self.critic_coeff * loss_value, clip_fraction, explained_variance
         return loss_value, clip_fraction, explained_variance
     @property
@@ -954,10 +972,10 @@ class ClipPPOLoss(PPOLoss):
             Defaults to ``1``.
         entropy_coeff: (scalar | Mapping[str, scalar], optional): entropy multiplier when computing the total loss.
             * **Scalar**: one value applied to the summed entropy of every action head.
-            * **Mapping** ``{head_name: coef}`` gives an individual coefficient for each action-head's entropy.
+            * **Mapping** ``{head_name: coeff}`` gives an individual coefficient for each action-head's entropy.
             Defaults to ``0.01``.
-        critic_coef (scalar, optional): critic loss multiplier when computing the total
-            loss. Defaults to ``1.0``. Set ``critic_coef`` to ``None`` to exclude the value
+        critic_coeff (scalar, optional): critic loss multiplier when computing the total
+            loss. Defaults to ``1.0``. Set ``critic_coeff`` to ``None`` to exclude the value
             loss from the forward outputs.
         loss_critic_type (str, optional): loss function for the value discrepancy.
             Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``.
@@ -1057,8 +1075,8 @@ class ClipPPOLoss(PPOLoss):
         clip_epsilon: float = 0.2,
         entropy_bonus: bool = True,
         samples_mc_entropy: int = 1,
-        entropy_coeff: float | Mapping[str, float] = 0.01,
-        critic_coef: float | None = None,
+        entropy_coeff: float | Mapping[str, float] | None = None,
+        critic_coeff: float | None = None,
         loss_critic_type: str = "smooth_l1",
         normalize_advantage: bool = False,
         normalize_advantage_exclude_dims: tuple[int] = (),
@@ -1079,7 +1097,7 @@ class ClipPPOLoss(PPOLoss):
             entropy_bonus=entropy_bonus,
             samples_mc_entropy=samples_mc_entropy,
             entropy_coeff=entropy_coeff,
-            critic_coef=critic_coef,
+            critic_coeff=critic_coeff,
             loss_critic_type=loss_critic_type,
             normalize_advantage=normalize_advantage,
             normalize_advantage_exclude_dims=normalize_advantage_exclude_dims,
@@ -1247,9 +1265,9 @@ class KLPENPPOLoss(PPOLoss):
             Defaults to ``1``.
         entropy_coeff: scalar | Mapping[str, scalar], optional): entropy multiplier when computing the total loss.
             * **Scalar**: one value applied to the summed entropy of every action head.
-            * **Mapping** ``{head_name: coef}`` gives an individual coefficient for each action-head's entropy.
+            * **Mapping** ``{head_name: coeff}`` gives an individual coefficient for each action-head's entropy.
             Defaults to ``0.01``.
-        critic_coef (scalar, optional): critic loss multiplier when computing the total
+        critic_coeff (scalar, optional): critic loss multiplier when computing the total
             loss. Defaults to ``1.0``.
         loss_critic_type (str, optional): loss function for the value discrepancy.
             Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``.
@@ -1351,8 +1369,8 @@ class KLPENPPOLoss(PPOLoss):
         samples_mc_kl: int = 1,
         entropy_bonus: bool = True,
         samples_mc_entropy: int = 1,
-        entropy_coeff: float | Mapping[str, float] = 0.01,
-        critic_coef: float | None = None,
+        entropy_coeff: float | Mapping[str, float] | None = None,
+        critic_coeff: float | None = None,
         loss_critic_type: str = "smooth_l1",
         normalize_advantage: bool = False,
         normalize_advantage_exclude_dims: tuple[int] = (),
@@ -1369,7 +1387,7 @@ class KLPENPPOLoss(PPOLoss):
             entropy_bonus=entropy_bonus,
             samples_mc_entropy=samples_mc_entropy,
             entropy_coeff=entropy_coeff,
-            critic_coef=critic_coef,
+            critic_coeff=critic_coeff,
             loss_critic_type=loss_critic_type,
             normalize_advantage=normalize_advantage,
             normalize_advantage_exclude_dims=normalize_advantage_exclude_dims,

torchrl/trainers/helpers/losses.py CHANGED Viewed

@@ -86,7 +86,7 @@ class A2CLossConfig:
     # Decay factor for return computation. Default=0.99.
     entropy_coeff: float = 1e-3
     # Entropy factor for the A2C loss
-    critic_coef: float = 1.0
+    critic_coeff: float = 1.0
     # Critic factor for the A2C loss
     critic_loss_function: str = "smooth_l1"
     # loss function for the value network. Either one of l1, l2 or smooth_l1 (default).
@@ -112,7 +112,7 @@ class PPOLossConfig:
     # Number of samples to use for a Monte-Carlo estimate if the policy distribution has not closed formula.
     loss_function: str = "smooth_l1"
     # loss function for the value network. Either one of l1, l2 or smooth_l1 (default).
-    critic_coef: float = 1.0
+    critic_coeff: float = 1.0
     # Critic loss multiplier when computing the total loss.
     # ClipPPOLoss parameters:

torchrl/version.py CHANGED Viewed

@@ -1,2 +1,2 @@
-__version__ = '2025.7.15'
-git_version = '77c00b910e6fdd85aa94b4d354390b724af4ec94'
+__version__ = '2025.7.18'
+git_version = '4001d9cb73cea4498b0fdfe420effc58a5a336be'

{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchrl-nightly
-Version: 2025.7.15
+Version: 2025.7.18
 Home-page: https://github.com/pytorch/rl
 Author: torchrl contributors
 Author-email: vmoens@fb.com

{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/RECORD RENAMED Viewed

@@ -3,11 +3,11 @@ build_tools/setup_helpers/__init__.py,sha256=7l8TvVqxKezgzKCLuRv20mvGLloprFVZYm8
 build_tools/setup_helpers/extension.py,sha256=4-PDLr-pw40bJnd9SfxnTaSjUyuXU_Tg8yOg69Kl0o4,5914
 torchrl/__init__.py,sha256=mhDBx2UIuBKc0gmi8dVNHokQ6tCbIovruZmyAxcSsy8,2938
 torchrl/_extension.py,sha256=z7wQ8i1iYWYcnygq_j0nq9sT-koY13tfHhTLNbMk17Q,2353
-torchrl/_torchrl.cpython-313-darwin.so,sha256=Oc5ssTpuyTv6h4iaS_pHmxT44bNN96SH0V9beynYRSc,1692464
+torchrl/_torchrl.cpython-313-darwin.so,sha256=xjRDwIjMdFaN3InficNo2b0v9ju7g-ONAgZ0DcGbk38,1692464
 torchrl/_utils.py,sha256=Cw5EG6x5oSZF1iE3YCs1a32VUKp0rTXIs2u67q9zKUI,41078
-torchrl/version.py,sha256=Fow5OPjVvk1yM4tQyBX-t6Un4hGKcYsr4kuvYN_gGPs,83
+torchrl/version.py,sha256=MHs4CxNjQupYI_f84bY7dOAAfPSU9yN6TOyxiS7tS8c,83
 torchrl/collectors/__init__.py,sha256=hJ3JD6shRku0BL6SzJQq44FZ5Q1RGR8LealFyU3FRn4,799
-torchrl/collectors/collectors.py,sha256=WoeR-MAfzcLiy8EHPWQ3uknm_jTWjA9Wi45CODG8NZI,177782
+torchrl/collectors/collectors.py,sha256=HpaW-y0bQOaOql8_7VyEPJ084CulrVwn6iBpGYoHyH4,178287
 torchrl/collectors/utils.py,sha256=MlXrkYuDmV0Em-tVNQiLL32FWgPNDgceYYG_GgpiviA,11320
 torchrl/collectors/weight_update.py,sha256=nSIfs8ALsfggLoC2ylg1oOAqdGku1tt4e-50JCZJBww,21073
 torchrl/collectors/distributed/__init__.py,sha256=_24P0ALFunLhL-ls7EsssGUhJkZ_m3nw7krfMTwPqS0,705
@@ -25,7 +25,7 @@ torchrl/collectors/llm/weight_update/__init__.py,sha256=bKjvD7yZG5VnHgvYc4EmKI1s
 torchrl/collectors/llm/weight_update/vllm.py,sha256=slKUmrIo4eL6R4J1oEnmlP6Q7Zer09p92JU8zbIHFUM,11515
 torchrl/data/__init__.py,sha256=oowsio6ZUOZnJV8JV43xgs17B37XO1yKAYIQPdk8yt0,4819
 torchrl/data/rlhf.py,sha256=JUmdYBWgkN229DwpXuDrhy9ddjduNvU2kyHzHR6MoA0,963
-torchrl/data/tensor_specs.py,sha256=_t6-iobtJClJ50zvo1KzHSaYS5CvL2Ca6x8btlAc3rs,253067
+torchrl/data/tensor_specs.py,sha256=RlMckj6PJo9MQMzneHzbcVe9xUyMB_n7pnSz0jytB9s,253907
 torchrl/data/utils.py,sha256=attuNwzfgjszyp0lJSrV06f2peX3r0qTjRZWEwfl6Yg,12108
 torchrl/data/datasets/__init__.py,sha256=NQpXsHecbZmza8AocX9mkqQQNkdFzeUrMTZoi6hbbU4,733
 torchrl/data/datasets/atari_dqn.py,sha256=3ij6-UGfKev-QJuUEhZEEmn_3yL210CqKJALaFvlc5M,40739
@@ -41,7 +41,7 @@ torchrl/data/datasets/utils.py,sha256=nAFDTlBIPyEoPoJC-Hc_fcOhzE7UZQE4BwKxq15Vhv
 torchrl/data/datasets/vd4rl.py,sha256=z90MqrxKzod8TPGK0uzkC6vw5wQIE4cgrDAC4e72jyk,18262
 torchrl/data/llm/__init__.py,sha256=B4Ekok-w5PMiWcfmAGXaseaN6hWdNOr4WebeLrHfBVQ,975
 torchrl/data/llm/dataset.py,sha256=t-41hAzQcjrdoKwpHIMbcrT7pRcQ7DHl2a1-lr6E7W4,20703
-torchrl/data/llm/history.py,sha256=Tzkwmc37C9vYjVw_x1wblyENNZSV67srBEioO2j4v2c,57857
+torchrl/data/llm/history.py,sha256=l9JSxIO5eLUFwHH5IZkANSrByYa8BGmtxMlNXYf2fbs,59640
 torchrl/data/llm/prompt.py,sha256=bg5LzJfwOq5Ns72KQMciIprMWAmDDinzdopwdopU04c,8380
 torchrl/data/llm/reward.py,sha256=FbPchNXG3smJV9NCbB5Yk4grsCa2Se4KZ_tojVLKWQM,8404
 torchrl/data/llm/topk.py,sha256=mYXCgJS4TuEVLZfTNccQd6kmC858AAh2Ygy0q_K1hlY,8365
@@ -131,7 +131,7 @@ torchrl/envs/transforms/llm.py,sha256=rQDzuut807wvFpSPCm5tynt8-cMKTgVKVjSVu9D99P
 torchrl/envs/transforms/r3m.py,sha256=sdTVLpnxHfzFVo5rO8WnXf2uUg9cr4LBOLBsWaFgGT8,13478
 torchrl/envs/transforms/rb_transforms.py,sha256=6ohnKXHHAEh2Hz3Seaw6eDrcFMu-1IVQrT7RVywh3YQ,7447
 torchrl/envs/transforms/rlhf.py,sha256=lOVXYqQaoDfm4_n77Dxw_wjicBpMtDvavKmBIK2N3lU,628
-torchrl/envs/transforms/transforms.py,sha256=QnPV5R0sDbR9bHJnRSG8JBy6cnMIeKG7vYUQjRVw5a8,482966
+torchrl/envs/transforms/transforms.py,sha256=cDv_NxElzTOW8qQO-2krvOBmlKVGPOKMfqM6XyuLckU,482882
 torchrl/envs/transforms/utils.py,sha256=7ToVFnD4-DkOMtML91g4bqXeY0bZ-gmCaSLxC93oaKM,3264
 torchrl/envs/transforms/vc1.py,sha256=mho5BvdAK-f9hD9t-iah52wT2B06qPmaJO7chrfIOWY,10534
 torchrl/envs/transforms/vecnorm.py,sha256=XahMcWvK3zjOB6EACSZtJ6UMP3yQ2zD9xf87UEB37Eg,34047
@@ -139,7 +139,7 @@ torchrl/envs/transforms/vip.py,sha256=kmygbenw75rEYsKRq4X1hzEH_CRe1406NZZ8Hg2R_V
 torchrl/modules/__init__.py,sha256=XlAO0hulhDQNcKhbu3cFi8KJOHXNiAgmXeTfny0WBqE,4157
 torchrl/modules/distributions/__init__.py,sha256=RDFoYD9IX1FhwXk5R4M8khq42gdTOcVnUnKHfWCTZBQ,1597
 torchrl/modules/distributions/continuous.py,sha256=VPBugDuavJmyZ-RzemyLIFA02UCMLsm-rzBQrKcTlIA,25667
-torchrl/modules/distributions/discrete.py,sha256=czQSNkacxgZcExKONzDRPZjCJPbfAVaS7fC7Igdp708,35555
+torchrl/modules/distributions/discrete.py,sha256=7UE6X8LeTZkaTRFvKNcFSOoug_tOcD_u-FOh-39ZSC4,35581
 torchrl/modules/distributions/truncated_normal.py,sha256=-qM8vwxTzv3VsWphZwcueDQpHQ67IRnkDFKlTDkQQnY,5937
 torchrl/modules/distributions/utils.py,sha256=kXRvNHeKUePIgKgn7DnKqbhQ6ImFGgkFVRxITX2dwNU,7567
 torchrl/modules/llm/__init__.py,sha256=BTkn-8QKp_8sW_NTKP02yoWSJUsX0XL6L9chTJl6epc,737
@@ -147,9 +147,9 @@ torchrl/modules/llm/utils.py,sha256=gf_F-4bEMwkcI3jLQM7ifB7nsjRctGebB5E2c-AznO0,
 torchrl/modules/llm/backends/__init__.py,sha256=WdVy9EdiAfk8i5zFa49TEkRvcUd0L4Un4v6wqWBy8l8,438
 torchrl/modules/llm/backends/vllm.py,sha256=x57Xop1xd5ZShicsh47ZFmz4VpfZ3eCzVx7k0COvpqQ,9387
 torchrl/modules/llm/policies/__init__.py,sha256=nfZ2mcVuucxnY3WCuzIQrTLIf1yEd36k8-AlvwnSa8Y,545
-torchrl/modules/llm/policies/common.py,sha256=zuaw0CVBAuMcd857JkdVWfSaxGFgwDXWOPF8GflqIkw,36379
-torchrl/modules/llm/policies/transformers_wrapper.py,sha256=HTkubIsbEui2hWqAZ3GwsATI2NGmA0kry1nW5RjnEJ0,74326
-torchrl/modules/llm/policies/vllm_wrapper.py,sha256=u0ITRdVI8pNhpRRMy2yXEh9bK_TkYRUOUEzix2m2aR0,78231
+torchrl/modules/llm/policies/common.py,sha256=Kvn1cJQbp1EZtxWpAQ50TzZkwVtLAmryqiBHH2nK_wM,39112
+torchrl/modules/llm/policies/transformers_wrapper.py,sha256=oi-2KALM0pkH-u-Kd6WlnxfH9eGV2GzBqM410ANpPeM,75777
+torchrl/modules/llm/policies/vllm_wrapper.py,sha256=ReBvi2M9IAiwwBAR7GpDLSQhX0aC-dXPnHYb082Q0To,79632
 torchrl/modules/models/__init__.py,sha256=DrOG-7hynjjUh_tc2EqysiUiNMRiDR0WLtZql9TPNcI,1743
 torchrl/modules/models/batchrenorm.py,sha256=TojpTUluIcFdTSemIVRLGtB2O5q54mRHy3vJP6DuI5I,4750
 torchrl/modules/models/decision_transformer.py,sha256=Lttf_wZMNqXbB_vpxMYgEp18gEzOvm3NvMnxQkHkH4M,6604
@@ -176,7 +176,7 @@ torchrl/modules/utils/__init__.py,sha256=KXaF_xEghKSPsNg0JyfxChK6KWHFRy0lwkL2Rip
 torchrl/modules/utils/mappings.py,sha256=VMYrPxDk1ywgl2l_f6HXZaRsVOKcYR7VF5DNkmi3lHk,362
 torchrl/modules/utils/utils.py,sha256=WPfcE-AoemnrP7Ny4FxJ-_LoQsBnX-y77Zb7MnZjXV0,2916
 torchrl/objectives/__init__.py,sha256=pnprzIXA6E9Ph7isYgNLh4SFTU0pxIQg4oUNcaQ6doc,2148
-torchrl/objectives/a2c.py,sha256=K8mWcLVLUnuW5DgPZCS8P9nN1t30Gvw0j-EgcnO-QGE,27895
+torchrl/objectives/a2c.py,sha256=_xdp8D2ErOPyHwpxqPHtUr-EvZw7MqcuhhK9Isnewgo,28791
 torchrl/objectives/common.py,sha256=40inZ0z3bFdQUkXuup3PWP_KmCx1m13cKTksjOp_b6I,28571
 torchrl/objectives/cql.py,sha256=8faIZmA9e65NQ39HAi6torMofr98bkngjtBXm0UbnVM,54925
 torchrl/objectives/crossq.py,sha256=a_vAjET5GG-2U7zZDgMnA0QP1iPCtv2ho6q-XvvLsnc,28858
@@ -188,7 +188,7 @@ torchrl/objectives/dreamer.py,sha256=vIJQN91oPXYnPubDFQpaF5d3fR_WwIYuIVYtoCvw0TY
 torchrl/objectives/functional.py,sha256=ZaglBjEGuOTNGeFA-Ox-ugZVcNegQMUj--KWHDRBmaU,2106
 torchrl/objectives/gail.py,sha256=0m34XmcN-EDk5OfNIo5bKYbKKZfATsYRv4zQe3v2UwA,9576
 torchrl/objectives/iql.py,sha256=1jvlSznWke6NZSwfuYyHVnVBE7Cz3q169GnCRC7iel4,42991
-torchrl/objectives/ppo.py,sha256=x3wJ3k7jVZWPAZCxdk4bgzhoTYukPwTj39Yo6ZgBbCM,75250
+torchrl/objectives/ppo.py,sha256=0soC2aiCOFNM5hCL20-99LX_NZi6XIXDmG2IkGEHSek,76082
 torchrl/objectives/redq.py,sha256=4usM-nG2UWujeL-VEqzf7-uOwRFx6itkKCeitKuJhtw,28507
 torchrl/objectives/reinforce.py,sha256=ySXLp5C-OOUYayqjrf4taQmL8LgRvMgPCgHDsle8JDc,22339
 torchrl/objectives/sac.py,sha256=Oq9Iq90s9KFbnM4KSRUd2onU1JfW6aW80LWGdtO0CY8,63993
@@ -219,12 +219,12 @@ torchrl/trainers/helpers/__init__.py,sha256=HhDB2Ubq2gZodV-hB6xw4ZgCgwaZKUoZgOfV
 torchrl/trainers/helpers/collectors.py,sha256=NjMMvGWEe4TWkVXzx7AlJ_Qa_AxEzMl6EUmEgUzHkoE,18715
 torchrl/trainers/helpers/envs.py,sha256=1yqJZgz7mc5wa58HmSDGpPQINeDHFZB0_KTgwdKm9QE,22084
 torchrl/trainers/helpers/logger.py,sha256=FtuEiLnK4NmxVVNyEEWaoCu3nG7WbNpHP3UYGQRJmgo,1278
-torchrl/trainers/helpers/losses.py,sha256=qH-2YJwMtDAYAPXTTYy3cOPiq4ILC6xTjfnGUU__6vo,5270
+torchrl/trainers/helpers/losses.py,sha256=sHlJqjh02t8cKN73X35Azd_OoWGurohLuviB8Yeo4JQ,5272
 torchrl/trainers/helpers/models.py,sha256=ihTERG2c96E8cS3Tnul6a_ys6iDEEJmHh05p9blQTW8,21807
 torchrl/trainers/helpers/replay_buffer.py,sha256=ZUZHOa0TILyeWJ3iahzTJ6UvMl_0FdxuZfJEja94Bn8,2001
 torchrl/trainers/helpers/trainers.py,sha256=j6B5XA7_FFHMQeOIQwjNcO0CGE_4mZKUC9_jH_iqqh4,12071
-torchrl_nightly-2025.7.15.dist-info/licenses/LICENSE,sha256=xdjS4_xk-IwnLuIFCvTYTl9Y8aXRejqpmke3dGam_nI,1098
-torchrl_nightly-2025.7.15.dist-info/METADATA,sha256=j4RRTr55v80t_WJvysde-14_KWj9VMI3H7eXvuAmbeQ,42990
-torchrl_nightly-2025.7.15.dist-info/WHEEL,sha256=A6iggJuFsuu67bHdjxJADhwSEJmqwgO3xFoNCIwjOxc,115
-torchrl_nightly-2025.7.15.dist-info/top_level.txt,sha256=JeTJ1jV7QJwLcUS1nr21aPn_wb-XlAZ9c-z_EH472JA,20
-torchrl_nightly-2025.7.15.dist-info/RECORD,,
+torchrl_nightly-2025.7.18.dist-info/licenses/LICENSE,sha256=xdjS4_xk-IwnLuIFCvTYTl9Y8aXRejqpmke3dGam_nI,1098
+torchrl_nightly-2025.7.18.dist-info/METADATA,sha256=K_Nmn84sw1xeD28lqIPqdhLjdaFchSMXuG2vjAajTn0,42990
+torchrl_nightly-2025.7.18.dist-info/WHEEL,sha256=A6iggJuFsuu67bHdjxJADhwSEJmqwgO3xFoNCIwjOxc,115
+torchrl_nightly-2025.7.18.dist-info/top_level.txt,sha256=JeTJ1jV7QJwLcUS1nr21aPn_wb-XlAZ9c-z_EH472JA,20
+torchrl_nightly-2025.7.18.dist-info/RECORD,,

{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/WHEEL RENAMED Viewed

File without changes

{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{torchrl_nightly-2025.7.15.dist-info → torchrl_nightly-2025.7.18.dist-info}/top_level.txt RENAMED Viewed

File without changes