torchrl-nightly 2025.6.19__cp39-cp39-win_amd64.whl → 2025.6.21__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. torchrl/_torchrl.cp39-win_amd64.pyd +0 -0
  2. torchrl/collectors/collectors.py +49 -24
  3. torchrl/collectors/llm/base.py +13 -6
  4. torchrl/collectors/llm/ray_collector.py +3 -0
  5. torchrl/data/__init__.py +2 -0
  6. torchrl/data/datasets/minari_data.py +1 -1
  7. torchrl/data/llm/__init__.py +2 -0
  8. torchrl/data/llm/chat.py +59 -9
  9. torchrl/data/llm/topk.py +186 -0
  10. torchrl/data/replay_buffers/ray_buffer.py +15 -1
  11. torchrl/data/replay_buffers/replay_buffers.py +50 -11
  12. torchrl/data/replay_buffers/samplers.py +98 -21
  13. torchrl/data/replay_buffers/storages.py +29 -2
  14. torchrl/envs/llm/__init__.py +2 -0
  15. torchrl/envs/llm/chat.py +4 -1
  16. torchrl/envs/llm/reward/gsm8k.py +15 -8
  17. torchrl/envs/llm/transforms/__init__.py +2 -1
  18. torchrl/envs/llm/transforms/kl.py +240 -4
  19. torchrl/envs/transforms/transforms.py +11 -27
  20. torchrl/modules/llm/policies/transformers_wrapper.py +71 -15
  21. torchrl/modules/llm/policies/vllm_wrapper.py +38 -5
  22. torchrl/objectives/llm/__init__.py +2 -1
  23. torchrl/objectives/llm/sft.py +465 -0
  24. torchrl/objectives/ppo.py +35 -12
  25. torchrl/version.py +2 -2
  26. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/METADATA +1 -1
  27. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/RECORD +30 -28
  28. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/LICENSE +0 -0
  29. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/WHEEL +0 -0
  30. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/top_level.txt +0 -0
torchrl/data/replay_buffers/replay_buffers.py CHANGED
@@ -702,7 +702,7 @@ class ReplayBuffer:
             self._sampler.add(index)
         return index
 
-    def _extend(self, data: Sequence) -> torch.Tensor:
+    def _extend(self, data: Sequence, *, update_priority: bool = True) -> torch.Tensor:
         is_comp = is_compiling()
         nc = contextlib.nullcontext()
         with self._replay_lock if not is_comp else nc, self._write_lock if not is_comp else nc:
@@ -712,7 +712,9 @@ class ReplayBuffer:
             self._sampler.extend(index)
         return index
 
-    def extend(self, data: Sequence) -> torch.Tensor:
+    def extend(
+        self, data: Sequence, *, update_priority: bool | None = None
+    ) -> torch.Tensor:
         """Extends the replay buffer with one or more elements contained in an iterable.
 
         If present, the inverse transforms will be called.`
@@ -721,6 +723,10 @@ class ReplayBuffer:
             data (iterable): collection of data to be added to the replay
                 buffer.
 
+        Keyword Args:
+            update_priority (bool, optional): Whether to update the priority of the data. Defaults to True.
+                Without effect in this class. See :meth:`~torchrl.data.TensorDictReplayBuffer.extend` for more details.
+
         Returns:
             Indices of the data added to the replay buffer.
 
@@ -735,12 +741,16 @@
             unbound elements can be provided (no PyTrees).
 
         """
+        if update_priority is not None:
+            raise NotImplementedError(
+                "update_priority is not supported in this class. See :meth:`~torchrl.data.TensorDictReplayBuffer.extend` for more details."
+            )
         if self._transform is not None and len(self._transform):
             with _set_dispatch_td_nn_modules(is_tensor_collection(data)):
                 data = self._transform.inv(data)
         if data is None:
             return torch.zeros((0, self._storage.ndim), dtype=torch.long)
-        return self._extend(data)
+        return self._extend(data, update_priority=update_priority)
 
     def update_priority(
         self,
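
A note on the new keyword: on the base `ReplayBuffer` it is accepted in the signature but has no effect, and passing it explicitly raises. A minimal usage sketch, assuming the 2025.6.21 nightly above is installed:

import torch
from torchrl.data import ReplayBuffer, LazyTensorStorage

rb = ReplayBuffer(storage=LazyTensorStorage(100))
rb.extend(torch.arange(10))  # update_priority defaults to None and is ignored here
try:
    rb.extend(torch.arange(10), update_priority=True)
except NotImplementedError:
    # the plain ReplayBuffer tracks no priorities; use TensorDictReplayBuffer instead
    pass
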
@@ -914,8 +924,8 @@ class ReplayBuffer:
                 self._iterator = iter(self)
             out = next(self._iterator)
             # if any, we don't want the device ref to be passed in distributed settings
-            if out is not None:
-                out.clear_device_()
+            if out is not None and (out.device != "cpu"):
+                out = out.copy().clear_device_()
             return out
         except StopIteration:
             self._iterator = None
@@ -1015,6 +1025,9 @@ class PrioritizedReplayBuffer(ReplayBuffer):
         storage (Storage, optional): the storage to be used. If none is provided
             a default :class:`~torchrl.data.replay_buffers.ListStorage` with
             ``max_size`` of ``1_000`` will be created.
+        sampler (Sampler, optional): the sampler to be used. If none is provided,
+            a default :class:`~torchrl.data.replay_buffers.PrioritizedSampler` with
+            ``alpha``, ``beta``, and ``eps`` will be created.
         collate_fn (callable, optional): merges a list of samples to form a
             mini-batch of Tensor(s)/outputs. Used when using batched
             loading from a map-style dataset. The default value will be decided
@@ -1107,6 +1120,7 @@ class PrioritizedReplayBuffer(ReplayBuffer):
         eps: float = 1e-8,
         dtype: torch.dtype = torch.float,
         storage: Storage | None = None,
+        sampler: Sampler | None = None,
         collate_fn: Callable | None = None,
         pin_memory: bool = False,
         prefetch: int | None = None,
@@ -1116,7 +1130,8 @@ class PrioritizedReplayBuffer(ReplayBuffer):
     ) -> None:
         if storage is None:
             storage = ListStorage(max_size=1_000)
-        sampler = PrioritizedSampler(storage.max_size, alpha, beta, eps, dtype)
+        if sampler is None:
+            sampler = PrioritizedSampler(storage.max_size, alpha, beta, eps, dtype)
         super().__init__(
             storage=storage,
             sampler=sampler,
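
The new `sampler` argument lets a pre-built sampler be injected instead of the `PrioritizedSampler` the buffer would otherwise construct from `alpha`, `beta` and `eps`. A hedged sketch (argument values are illustrative):

from torchrl.data import PrioritizedReplayBuffer
from torchrl.data.replay_buffers import ListStorage, PrioritizedSampler

storage = ListStorage(max_size=1_000)
# build the sampler explicitly, e.g. to share it across buffers or tweak its dtype
sampler = PrioritizedSampler(storage.max_size, alpha=0.7, beta=0.9, eps=1e-8)
rb = PrioritizedReplayBuffer(alpha=0.7, beta=0.9, storage=storage, sampler=sampler)
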
@@ -1347,7 +1362,20 @@ class TensorDictReplayBuffer(ReplayBuffer):
             self.update_tensordict_priority(data)
         return index
 
-    def extend(self, tensordicts: TensorDictBase) -> torch.Tensor:
+    def extend(
+        self, tensordicts: TensorDictBase, *, update_priority: bool | None = None
+    ) -> torch.Tensor:
+        """Extends the replay buffer with a batch of data.
+
+        Args:
+            tensordicts (TensorDictBase): The data to extend the replay buffer with.
+
+        Keyword Args:
+            update_priority (bool, optional): Whether to update the priority of the data. Defaults to True.
+
+        Returns:
+            The indices of the data that were added to the replay buffer.
+        """
         if not isinstance(tensordicts, TensorDictBase):
             raise ValueError(
                 f"{self.__class__.__name__} only accepts TensorDictBase subclasses. tensorclasses "
@@ -1365,8 +1393,17 @@ class TensorDictReplayBuffer(ReplayBuffer):
         # is that just doing this results in indices that are not sorted like the original data
         # so the actually indices will have to be used on the _storage directly (not on the buffer)
         self._set_index_in_td(tensordicts, index)
-        # TODO: in principle this is a good idea but currently it doesn't work + it re-writes a priority that has just been written
-        # self.update_tensordict_priority(tensordicts)
+        if update_priority is None:
+            update_priority = True
+        if update_priority:
+            try:
+                vector = tensordicts.get(self.priority_key)
+                if vector is not None:
+                    self.update_priority(index, vector)
+            except Exception as e:
+                raise RuntimeError(
+                    "Failed to update priority of extended data. You can try to set update_priority=False in the extend method and update the priority manually."
+                ) from e
         return index
 
     def _set_index_in_td(self, tensordict, index):
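
`TensorDictReplayBuffer.extend` now pushes the priorities found under the buffer's priority key into the sampler, unless `update_priority=False`. A short sketch, assuming the default ``"td_error"`` priority key:

import torch
from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyTensorStorage
from torchrl.data.replay_buffers import PrioritizedSampler

rb = TensorDictReplayBuffer(
    storage=LazyTensorStorage(100),
    sampler=PrioritizedSampler(100, alpha=0.7, beta=0.9),
    priority_key="td_error",
)
data = TensorDict({"obs": torch.randn(10, 4), "td_error": torch.rand(10)}, batch_size=[10])
idx = rb.extend(data)                   # priorities read from data["td_error"]
rb.extend(data, update_priority=False)  # defer; call rb.update_priority(idx, ...) manually
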
@@ -1685,8 +1722,10 @@ class RemoteTensorDictReplayBuffer(TensorDictReplayBuffer):
     def add(self, data: TensorDictBase) -> int:
         return super().add(data)
 
-    def extend(self, tensordicts: list | TensorDictBase) -> torch.Tensor:
-        return super().extend(tensordicts)
+    def extend(
+        self, tensordicts: list | TensorDictBase, *, update_priority: bool | None = None
+    ) -> torch.Tensor:
+        return super().extend(tensordicts, update_priority=update_priority)
 
     def update_priority(
         self, index: int | torch.Tensor, priority: int | torch.Tensor
torchrl/data/replay_buffers/samplers.py CHANGED
@@ -291,17 +291,38 @@ class SamplerWithoutReplacement(Sampler):
 
 
 class PrioritizedSampler(Sampler):
-    """Prioritized sampler for replay buffer.
+    r"""Prioritized sampler for replay buffer.
 
-    Presented in "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015. Prioritized experience replay." (https://arxiv.org/abs/1511.05952)
+    This sampler implements Prioritized Experience Replay (PER) as presented in
+    "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015. Prioritized experience replay."
+    (https://arxiv.org/abs/1511.05952)
+
+    **Core Idea**: Instead of sampling experiences uniformly from the replay buffer,
+    PER samples experiences with probability proportional to their "importance" - typically
+    measured by the magnitude of their temporal-difference (TD) error. This prioritization
+    can lead to faster learning by focusing on experiences that are most informative.
+
+    **How it works**:
+    1. Each experience is assigned a priority based on its TD error: :math:`p_i = |\delta_i| + \epsilon`
+    2. Sampling probability is computed as: :math:`P(i) = \frac{p_i^\alpha}{\sum_j p_j^\alpha}`
+    3. Importance sampling weights correct for the bias: :math:`w_i = (N \cdot P(i))^{-\beta}`
 
     Args:
         max_capacity (int): maximum capacity of the buffer.
-        alpha (:obj:`float`): exponent α determines how much prioritization is used,
-            with α = 0 corresponding to the uniform case.
-        beta (:obj:`float`): importance sampling negative exponent.
-        eps (:obj:`float`, optional): delta added to the priorities to ensure that the buffer
-            does not contain null priorities. Defaults to 1e-8.
+        alpha (:obj:`float`): exponent :math:`\alpha` determines how much prioritization is used.
+            - :math:`\alpha = 0`: uniform sampling (no prioritization)
+            - :math:`\alpha = 1`: full prioritization based on TD error magnitude
+            - Typical values: 0.4-0.7 for balanced prioritization
+            - Higher :math:`\alpha` means more aggressive prioritization of high-error experiences
+        beta (:obj:`float`): importance sampling negative exponent :math:`\beta`.
+            - :math:`\beta` controls the correction for the bias introduced by prioritization
+            - :math:`\beta = 0`: no correction (biased towards high-priority samples)
+            - :math:`\beta = 1`: full correction (unbiased but potentially unstable)
+            - Typical values: start at 0.4-0.6 and anneal to 1.0 during training
+            - Lower :math:`\beta` early in training provides stability, higher :math:`\beta` later reduces bias
+        eps (:obj:`float`, optional): small constant added to priorities to ensure
+            no experience has zero priority. This prevents experiences from never
+            being sampled. Defaults to 1e-8.
         reduction (str, optional): the reduction method for multidimensional
             tensordicts (ie stored trajectory). Can be one of "max", "min",
             "median" or "mean".
@@ -309,6 +330,23 @@ class PrioritizedSampler(Sampler):
            is tracked within the buffer. When ``False``, the max-priority tracks
            the maximum value since the instantiation of the sampler.
 
+    **Parameter Guidelines**:
+    - **:math:`\alpha` (alpha)**: Controls how much to prioritize high-error experiences
+      - 0.4-0.7: Good balance between learning speed and stability
+      - 1.0: Maximum prioritization (may be unstable)
+      - 0.0: Uniform sampling (no prioritization benefit)
+
+    - **:math:`\beta` (beta)**: Controls importance sampling correction
+      - Start at 0.4-0.6 for training stability
+      - Anneal to 1.0 over training to reduce bias
+      - Lower values = more stable but biased
+      - Higher values = less biased but potentially unstable
+
+    - **:math:`\epsilon`**: Small constant to prevent zero priorities
+      - 1e-8: Good default value
+      - Too small: may cause numerical issues
+      - Too large: reduces prioritization effect
+
     Examples:
         >>> from torchrl.data.replay_buffers import ReplayBuffer, LazyTensorStorage, PrioritizedSampler
         >>> from tensordict import TensorDict
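
The formulas added to the docstring above can be checked numerically; a small standalone sketch with three hypothetical TD errors:

import torch

alpha, beta, eps = 0.7, 0.9, 1e-8
td_error = torch.tensor([0.5, 2.0, 0.1])
p = td_error.abs() + eps                 # p_i = |delta_i| + eps
probs = p**alpha / (p**alpha).sum()      # P(i) = p_i^alpha / sum_j p_j^alpha
w = (len(p) * probs) ** (-beta)          # w_i = (N * P(i))^(-beta)
w = w / w.max()                          # normalization commonly used in PER
print(probs, w)
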
@@ -412,7 +450,7 @@
             )
         return super().__getstate__()
 
-    def _init(self):
+    def _init(self) -> None:
         if self.dtype in (torch.float, torch.FloatType, torch.float32):
             self._sum_tree = SumSegmentTreeFp32(self._max_capacity)
             self._min_tree = MinSegmentTreeFp32(self._max_capacity)
@@ -425,21 +463,23 @@ class PrioritizedSampler(Sampler):
             )
         self._max_priority = None
 
-    def _empty(self):
+    def _empty(self) -> None:
         self._init()
 
     @property
-    def _max_priority(self):
+    def _max_priority(self) -> tuple[float | None, int | None]:
         max_priority_index = self.__dict__.get("_max_priority")
         if max_priority_index is None:
             return (None, None)
         return max_priority_index
 
     @_max_priority.setter
-    def _max_priority(self, value):
+    def _max_priority(self, value: tuple[float | None, int | None]) -> None:
         self.__dict__["_max_priority"] = value
 
-    def _maybe_erase_max_priority(self, index):
+    def _maybe_erase_max_priority(
+        self, index: torch.Tensor | int | slice | tuple
+    ) -> None:
         if not self._max_priority_within_buffer:
             return
         max_priority_index = self._max_priority[1]
@@ -1839,11 +1879,21 @@ class SliceSamplerWithoutReplacement(SliceSampler, SamplerWithoutReplacement):
 
 
 class PrioritizedSliceSampler(SliceSampler, PrioritizedSampler):
-    """Samples slices of data along the first dimension, given start and stop signals, using prioritized sampling.
+    r"""Samples slices of data along the first dimension, given start and stop signals, using prioritized sampling.
+
+    This class combines trajectory sampling with Prioritized Experience Replay (PER) as presented in
+    "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015. Prioritized experience replay."
+    (https://arxiv.org/abs/1511.05952)
+
+    **Core Idea**: Instead of sampling trajectory slices uniformly, this sampler prioritizes
+    trajectory start points based on the importance of the transitions at those positions.
+    This allows focusing learning on the most informative parts of trajectories.
 
-    This class samples sub-trajectories with replacement following a priority weighting presented in "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015.
-    Prioritized experience replay."
-    (https://arxiv.org/abs/1511.05952)
+    **How it works**:
+    1. Each transition is assigned a priority based on its TD error: :math:`p_i = |\\delta_i| + \\epsilon`
+    2. Trajectory start points are sampled with probability: :math:`P(i) = \frac{p_i^\alpha}{\\sum_j p_j^\alpha}`
+    3. Importance sampling weights correct for bias: :math:`w_i = (N \\cdot P(i))^{-\beta}`
+    4. Complete trajectory slices are extracted from the sampled start points
 
     For more info see :class:`~torchrl.data.replay_buffers.samplers.SliceSampler` and :class:`~torchrl.data.replay_buffers.samplers.PrioritizedSampler`.
 
@@ -1855,15 +1905,42 @@ class PrioritizedSliceSampler(SliceSampler, PrioritizedSampler):
         :meth:`update_priority`.
 
     Args:
-        alpha (:obj:`float`): exponent α determines how much prioritization is used,
-            with α = 0 corresponding to the uniform case.
-        beta (:obj:`float`): importance sampling negative exponent.
-        eps (:obj:`float`, optional): delta added to the priorities to ensure that the buffer
-            does not contain null priorities. Defaults to 1e-8.
+        max_capacity (int): maximum capacity of the buffer.
+        alpha (:obj:`float`): exponent :math:`\alpha` determines how much prioritization is used.
+            - :math:`\alpha = 0`: uniform sampling of trajectory start points
+            - :math:`\alpha = 1`: full prioritization based on TD error magnitude at start points
+            - Typical values: 0.4-0.7 for balanced prioritization
+            - Higher :math:`\alpha` means more aggressive prioritization of high-error trajectory regions
+        beta (:obj:`float`): importance sampling negative exponent :math:`\beta`.
+            - :math:`\beta` controls the correction for the bias introduced by prioritization
+            - :math:`\beta = 0`: no correction (biased towards high-priority trajectory regions)
+            - :math:`\beta = 1`: full correction (unbiased but potentially unstable)
+            - Typical values: start at 0.4-0.6 and anneal to 1.0 during training
+            - Lower :math:`\beta` early in training provides stability, higher :math:`\beta` later reduces bias
+        eps (:obj:`float`, optional): small constant added to priorities to ensure
+            no transition has zero priority. This prevents trajectory regions from never
+            being sampled. Defaults to 1e-8.
         reduction (str, optional): the reduction method for multidimensional
             tensordicts (i.e., stored trajectory). Can be one of "max", "min",
             "median" or "mean".
 
+    **Parameter Guidelines**:
+    - **:math:`\alpha` (alpha)**: Controls how much to prioritize high-error trajectory regions
+      - 0.4-0.7: Good balance between learning speed and stability
+      - 1.0: Maximum prioritization (may be unstable)
+      - 0.0: Uniform sampling (no prioritization benefit)
+
+    - **:math:`\beta` (beta)**: Controls importance sampling correction
+      - Start at 0.4-0.6 for training stability
+      - Anneal to 1.0 over training to reduce bias
+      - Lower values = more stable but biased
+      - Higher values = less biased but potentially unstable
+
+    - **:math:`\\epsilon`**: Small constant to prevent zero priorities
+      - 1e-8: Good default value
+      - Too small: may cause numerical issues
+      - Too large: reduces prioritization effect
+
     Keyword Args:
         num_slices (int): the number of slices to be sampled. The batch-size
             must be greater or equal to the ``num_slices`` argument. Exclusive
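
A hedged usage sketch of the sampler documented above; the `num_slices` keyword and the default ``"episode"`` trajectory key follow the `SliceSampler` conventions referenced in the docstring and are assumptions here, not part of this diff:

import torch
from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyTensorStorage
from torchrl.data.replay_buffers import PrioritizedSliceSampler

sampler = PrioritizedSliceSampler(max_capacity=100, alpha=0.7, beta=0.9, num_slices=4)
rb = TensorDictReplayBuffer(storage=LazyTensorStorage(100), sampler=sampler, batch_size=32)
data = TensorDict(
    {"obs": torch.randn(100, 4), "episode": torch.arange(100) // 25},  # 4 trajectories of 25 steps
    batch_size=[100],
)
rb.extend(data)
sample = rb.sample()  # 4 slices of 8 contiguous steps each (batch_size / num_slices)
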
torchrl/data/replay_buffers/storages.py CHANGED
@@ -230,15 +230,38 @@ class ListStorage(Storage):
         max_size (int, optional): the maximum number of elements stored in the storage.
             If not provided, an unlimited storage is created.
 
+    Keyword Args:
+        compilable (bool, optional): if ``True``, the storage will be made compatible with :func:`~torch.compile` at
+            the cost of being executable in multiprocessed settings.
+        device (str, optional): the device to use for the storage. Defaults to `None` (inputs are not moved to the device).
+
     """
 
     _default_checkpointer = ListStorageCheckpointer
 
-    def __init__(self, max_size: int | None = None, compilable: bool = False):
+    def __init__(
+        self,
+        max_size: int | None = None,
+        *,
+        compilable: bool = False,
+        device: torch.device | str | int | None = None,
+    ):
         if max_size is None:
             max_size = torch.iinfo(torch.int64).max
         super().__init__(max_size, compilable=compilable)
         self._storage = []
+        self.device = device
+
+    def _to_device(self, data: Any) -> Any:
+        """Utility method to move data to the device."""
+        if self.device is not None:
+            if hasattr(data, "to"):
+                data = data.to(self.device)
+            else:
+                data = tree_map(
+                    lambda x: x.to(self.device) if hasattr(x, "to") else x, data
+                )
+        return data
 
     def set(
         self,
@@ -254,6 +277,7 @@ class ListStorage(Storage):
             self.set(int(cursor), data, set_cursor=set_cursor)
             return
         if isinstance(cursor, slice):
+            data = self._to_device(data)
             self._storage[cursor] = data
             return
         if isinstance(
@@ -290,6 +314,7 @@ class ListStorage(Storage):
                 f"maximum capacity is {self.max_size} "
                 f"and the index of the item to be set is {cursor}."
             )
+        data = self._to_device(data)
         if cursor == len(self._storage):
             self._storage.append(data)
         else:
@@ -387,6 +412,7 @@ class LazyStackStorage(ListStorage):
         compilable (bool, optional): if ``True``, the storage will be made compatible with :func:`~torch.compile` at
             the cost of being executable in multiprocessed settings.
         stack_dim (int, optional): the stack dimension in terms of TensorDict batch sizes. Defaults to `0`.
+        device (str, optional): the device to use for the storage. Defaults to `None` (inputs are not moved to the device).
 
     Examples:
         >>> import torch
@@ -421,8 +447,9 @@ class LazyStackStorage(ListStorage):
         *,
         compilable: bool = False,
         stack_dim: int = 0,
+        device: torch.device | str | int | None = None,
     ):
-        super().__init__(max_size=max_size, compilable=compilable)
+        super().__init__(max_size=max_size, compilable=compilable, device=device)
         self.stack_dim = stack_dim
 
     def get(self, index: int | Sequence[int] | slice) -> Any:
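
The new `device` keyword on `ListStorage` (forwarded by `LazyStackStorage`) moves written items onto the target device via the `_to_device` helper shown above. A minimal sketch, assuming the 2025.6.21 nightly:

import torch
from tensordict import TensorDict
from torchrl.data.replay_buffers import ReplayBuffer, ListStorage

device = "cuda:0" if torch.cuda.is_available() else "cpu"
rb = ReplayBuffer(storage=ListStorage(max_size=100, device=device))
rb.add(TensorDict({"obs": torch.randn(4)}, batch_size=[]))
print(rb[0]["obs"].device)  # items were moved at write time
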
torchrl/envs/llm/__init__.py CHANGED
@@ -22,12 +22,14 @@ from .transforms import (
     KLRewardTransform,
     MCPToolTransform,
     PythonInterpreter,
+    RetrieveLogProb,
     TemplateTransform,
     Tokenizer,
 )
 
 __all__ = [
     "BrowserTransform",
+    "RetrieveLogProb",
     "ChatEnv",
     "DataLoadingPrimer",
     "DatasetChatEnv",
torchrl/envs/llm/chat.py CHANGED
@@ -206,7 +206,10 @@ class ChatEnv(EnvBase):
             if lh.role != self.policy_role:
                 raise ValueError(
                     "The role received in the last block parsed from the policy "
-                    f"output does not match the expected policy role: received {lh.role} but expected {self.policy_role}."
+                    f"output does not match the expected policy role: received {lh.role} but expected {self.policy_role}.\n"
+                    f"Parsed input: {text=}\n"
+                    f"Parsed history: {parsed_history=}\n"
+                    f"Final element: {local_history=}"
                 )
             # Append history item
             history = history.append(local_history, inplace=False)
torchrl/envs/llm/reward/gsm8k.py CHANGED
@@ -145,25 +145,32 @@ class GSM8KRewardParser(Transform):
             potential_answer = [potential_answer]
         if isinstance(cot, str):
             cot = [cot]
-        reward_answer = 5.0 * (len(potential_answer) == 1)
 
+        # Format quality rewards (always applied)
+        reward_answer = 5.0 * (len(potential_answer) == 1)
         reward_think = 5.0 * (len(cot) == 1)
 
-        # One of the answer tags has the right answer
+        # Answer correctness rewards
         reward_right = 20.0 * (
             any(attempt == true_answer for attempt in potential_answer)
         )
-
-        # One of the answer tags contains the right answer (might be e.g. $20 instead of 20)
         reward_contained = 10.0 * (
             any((true_answer in attempt) for attempt in potential_answer)
         )
 
         success = len(potential_answer) > 0 and potential_answer[-1] == true_answer
-        # Compose the rewards
-        reward = 100.0 * float(success) + (
-            reward_answer + reward_think + reward_contained + reward_right
-        ) * (1 - float(success))
+
+        # Base success reward (lower than before to make format quality more important)
+        base_success_reward = 60.0 if success else 0.0
+
+        # Compose the rewards - always include format quality, even when successful
+        reward = (
+            base_success_reward
+            + reward_answer
+            + reward_think
+            + reward_contained
+            + reward_right
+        )
         rewards = TensorDict(
             reward_answer=reward_answer,
             reward_contained=reward_contained,
torchrl/envs/llm/transforms/__init__.py CHANGED
@@ -6,7 +6,7 @@
 from .browser import BrowserTransform
 from .dataloading import as_nested_tensor, as_padded_tensor, DataLoadingPrimer
 from .format import TemplateTransform
-from .kl import KLRewardTransform
+from .kl import KLRewardTransform, RetrieveLogProb
 from .policy_version import PolicyVersion
 from .tokenizer import Tokenizer
 from .tools import MCPToolTransform, PythonInterpreter
@@ -15,6 +15,7 @@ __all__ = [
     "BrowserTransform",
     "DataLoadingPrimer",
     "KLRewardTransform",
+    "RetrieveLogProb",
     "MCPToolTransform",
     "PolicyVersion",
     "PythonInterpreter",