torchrl-nightly 2025.6.20-cp310-cp310-win_amd64.whl → 2025.6.22-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchrl/_torchrl.cp310-win_amd64.pyd +0 -0
- torchrl/collectors/collectors.py +8 -5
- torchrl/collectors/llm/base.py +13 -6
- torchrl/collectors/llm/ray_collector.py +3 -0
- torchrl/data/__init__.py +2 -0
- torchrl/data/llm/__init__.py +2 -0
- torchrl/data/llm/chat.py +59 -8
- torchrl/data/llm/topk.py +186 -0
- torchrl/data/replay_buffers/ray_buffer.py +15 -1
- torchrl/data/replay_buffers/replay_buffers.py +50 -11
- torchrl/data/replay_buffers/samplers.py +98 -21
- torchrl/data/replay_buffers/storages.py +29 -2
- torchrl/envs/llm/__init__.py +2 -0
- torchrl/envs/llm/chat.py +4 -1
- torchrl/envs/llm/reward/gsm8k.py +15 -8
- torchrl/envs/llm/transforms/__init__.py +2 -1
- torchrl/envs/llm/transforms/kl.py +240 -4
- torchrl/modules/llm/policies/transformers_wrapper.py +71 -15
- torchrl/modules/llm/policies/vllm_wrapper.py +38 -5
- torchrl/objectives/llm/__init__.py +2 -1
- torchrl/objectives/llm/sft.py +465 -0
- torchrl/version.py +2 -2
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/METADATA +1 -1
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/RECORD +27 -25
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/LICENSE +0 -0
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/WHEEL +0 -0
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/top_level.txt +0 -0
@@ -291,17 +291,38 @@ class SamplerWithoutReplacement(Sampler):


 class PrioritizedSampler(Sampler):
-    """Prioritized sampler for replay buffer.
+    r"""Prioritized sampler for replay buffer.

-
+    This sampler implements Prioritized Experience Replay (PER) as presented in
+    "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015. Prioritized experience replay."
+    (https://arxiv.org/abs/1511.05952)
+
+    **Core Idea**: Instead of sampling experiences uniformly from the replay buffer,
+    PER samples experiences with probability proportional to their "importance" - typically
+    measured by the magnitude of their temporal-difference (TD) error. This prioritization
+    can lead to faster learning by focusing on experiences that are most informative.
+
+    **How it works**:
+    1. Each experience is assigned a priority based on its TD error: :math:`p_i = |\delta_i| + \epsilon`
+    2. Sampling probability is computed as: :math:`P(i) = \frac{p_i^\alpha}{\sum_j p_j^\alpha}`
+    3. Importance sampling weights correct for the bias: :math:`w_i = (N \cdot P(i))^{-\beta}`

     Args:
         max_capacity (int): maximum capacity of the buffer.
-        alpha (:obj:`float`): exponent
-
-
-
-
+        alpha (:obj:`float`): exponent :math:`\alpha` determines how much prioritization is used.
+            - :math:`\alpha = 0`: uniform sampling (no prioritization)
+            - :math:`\alpha = 1`: full prioritization based on TD error magnitude
+            - Typical values: 0.4-0.7 for balanced prioritization
+            - Higher :math:`\alpha` means more aggressive prioritization of high-error experiences
+        beta (:obj:`float`): importance sampling negative exponent :math:`\beta`.
+            - :math:`\beta` controls the correction for the bias introduced by prioritization
+            - :math:`\beta = 0`: no correction (biased towards high-priority samples)
+            - :math:`\beta = 1`: full correction (unbiased but potentially unstable)
+            - Typical values: start at 0.4-0.6 and anneal to 1.0 during training
+            - Lower :math:`\beta` early in training provides stability, higher :math:`\beta` later reduces bias
+        eps (:obj:`float`, optional): small constant added to priorities to ensure
+            no experience has zero priority. This prevents experiences from never
+            being sampled. Defaults to 1e-8.
         reduction (str, optional): the reduction method for multidimensional
             tensordicts (ie stored trajectory). Can be one of "max", "min",
             "median" or "mean".
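For orientation, the three numbered steps added to the docstring translate into a few lines of tensor arithmetic. The sketch below is illustrative only: it uses plain PyTorch rather than the sum-tree machinery the sampler actually relies on, and the function name, the `td_errors` input and the default values are assumptions made for the example.

import torch

def per_probs_and_weights(td_errors, alpha=0.6, beta=0.4, eps=1e-8):
    # 1. priorities from TD errors: p_i = |delta_i| + eps
    priorities = td_errors.abs() + eps
    # 2. sampling probabilities: P(i) = p_i^alpha / sum_j p_j^alpha
    probs = priorities.pow(alpha)
    probs = probs / probs.sum()
    # 3. importance-sampling weights: w_i = (N * P(i))^(-beta), normalized by the max for stability
    weights = (td_errors.numel() * probs).pow(-beta)
    weights = weights / weights.max()
    return probs, weights

probs, weights = per_probs_and_weights(torch.randn(1000))
indices = torch.multinomial(probs, num_samples=32, replacement=True)  # prioritized draw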
@@ -309,6 +330,23 @@ class PrioritizedSampler(Sampler):
             is tracked within the buffer. When ``False``, the max-priority tracks
             the maximum value since the instantiation of the sampler.

+    **Parameter Guidelines**:
+    - **:math:`\alpha` (alpha)**: Controls how much to prioritize high-error experiences
+        - 0.4-0.7: Good balance between learning speed and stability
+        - 1.0: Maximum prioritization (may be unstable)
+        - 0.0: Uniform sampling (no prioritization benefit)
+
+    - **:math:`\beta` (beta)**: Controls importance sampling correction
+        - Start at 0.4-0.6 for training stability
+        - Anneal to 1.0 over training to reduce bias
+        - Lower values = more stable but biased
+        - Higher values = less biased but potentially unstable
+
+    - **:math:`\epsilon`**: Small constant to prevent zero priorities
+        - 1e-8: Good default value
+        - Too small: may cause numerical issues
+        - Too large: reduces prioritization effect
+
     Examples:
         >>> from torchrl.data.replay_buffers import ReplayBuffer, LazyTensorStorage, PrioritizedSampler
         >>> from tensordict import TensorDict
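The annealing advice in these guidelines (start :math:`\beta` around 0.4-0.6, finish at 1.0) is usually implemented as a linear ramp over the training budget. A minimal sketch of such a schedule follows; how the resulting value is handed back to the sampler is deliberately left out, since the sampler's beta attribute is not documented in this diff.

def beta_schedule(step, total_steps, beta_start=0.4, beta_end=1.0):
    # linear ramp from beta_start to beta_end, clamped once training ends
    frac = min(step / max(total_steps, 1), 1.0)
    return beta_start + frac * (beta_end - beta_start)

print([round(beta_schedule(s, 100), 2) for s in (0, 50, 100)])  # [0.4, 0.7, 1.0]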
@@ -412,7 +450,7 @@ class PrioritizedSampler(Sampler):
         )
         return super().__getstate__()

-    def _init(self):
+    def _init(self) -> None:
         if self.dtype in (torch.float, torch.FloatType, torch.float32):
             self._sum_tree = SumSegmentTreeFp32(self._max_capacity)
             self._min_tree = MinSegmentTreeFp32(self._max_capacity)
@@ -425,21 +463,23 @@ class PrioritizedSampler(Sampler):
             )
         self._max_priority = None

-    def _empty(self):
+    def _empty(self) -> None:
         self._init()

     @property
-    def _max_priority(self):
+    def _max_priority(self) -> tuple[float | None, int | None]:
         max_priority_index = self.__dict__.get("_max_priority")
         if max_priority_index is None:
             return (None, None)
         return max_priority_index

     @_max_priority.setter
-    def _max_priority(self, value):
+    def _max_priority(self, value: tuple[float | None, int | None]) -> None:
         self.__dict__["_max_priority"] = value

-    def _maybe_erase_max_priority(
+    def _maybe_erase_max_priority(
+        self, index: torch.Tensor | int | slice | tuple
+    ) -> None:
         if not self._max_priority_within_buffer:
             return
         max_priority_index = self._max_priority[1]
@@ -1839,11 +1879,21 @@ class SliceSamplerWithoutReplacement(SliceSampler, SamplerWithoutReplacement):


 class PrioritizedSliceSampler(SliceSampler, PrioritizedSampler):
-    """Samples slices of data along the first dimension, given start and stop signals, using prioritized sampling.
+    r"""Samples slices of data along the first dimension, given start and stop signals, using prioritized sampling.
+
+    This class combines trajectory sampling with Prioritized Experience Replay (PER) as presented in
+    "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015. Prioritized experience replay."
+    (https://arxiv.org/abs/1511.05952)
+
+    **Core Idea**: Instead of sampling trajectory slices uniformly, this sampler prioritizes
+    trajectory start points based on the importance of the transitions at those positions.
+    This allows focusing learning on the most informative parts of trajectories.

-
-
-
+    **How it works**:
+    1. Each transition is assigned a priority based on its TD error: :math:`p_i = |\\delta_i| + \\epsilon`
+    2. Trajectory start points are sampled with probability: :math:`P(i) = \frac{p_i^\alpha}{\\sum_j p_j^\alpha}`
+    3. Importance sampling weights correct for bias: :math:`w_i = (N \\cdot P(i))^{-\beta}`
+    4. Complete trajectory slices are extracted from the sampled start points

     For more info see :class:`~torchrl.data.replay_buffers.samplers.SliceSampler` and :class:`~torchrl.data.replay_buffers.samplers.PrioritizedSampler`.

@@ -1855,15 +1905,42 @@ class PrioritizedSliceSampler(SliceSampler, PrioritizedSampler):
         :meth:`update_priority`.

     Args:
-
-
-
-
-
+        max_capacity (int): maximum capacity of the buffer.
+        alpha (:obj:`float`): exponent :math:`\alpha` determines how much prioritization is used.
+            - :math:`\alpha = 0`: uniform sampling of trajectory start points
+            - :math:`\alpha = 1`: full prioritization based on TD error magnitude at start points
+            - Typical values: 0.4-0.7 for balanced prioritization
+            - Higher :math:`\alpha` means more aggressive prioritization of high-error trajectory regions
+        beta (:obj:`float`): importance sampling negative exponent :math:`\beta`.
+            - :math:`\beta` controls the correction for the bias introduced by prioritization
+            - :math:`\beta = 0`: no correction (biased towards high-priority trajectory regions)
+            - :math:`\beta = 1`: full correction (unbiased but potentially unstable)
+            - Typical values: start at 0.4-0.6 and anneal to 1.0 during training
+            - Lower :math:`\beta` early in training provides stability, higher :math:`\beta` later reduces bias
+        eps (:obj:`float`, optional): small constant added to priorities to ensure
+            no transition has zero priority. This prevents trajectory regions from never
+            being sampled. Defaults to 1e-8.
         reduction (str, optional): the reduction method for multidimensional
             tensordicts (i.e., stored trajectory). Can be one of "max", "min",
             "median" or "mean".

+    **Parameter Guidelines**:
+    - **:math:`\alpha` (alpha)**: Controls how much to prioritize high-error trajectory regions
+        - 0.4-0.7: Good balance between learning speed and stability
+        - 1.0: Maximum prioritization (may be unstable)
+        - 0.0: Uniform sampling (no prioritization benefit)
+
+    - **:math:`\beta` (beta)**: Controls importance sampling correction
+        - Start at 0.4-0.6 for training stability
+        - Anneal to 1.0 over training to reduce bias
+        - Lower values = more stable but biased
+        - Higher values = less biased but potentially unstable
+
+    - **:math:`\\epsilon`**: Small constant to prevent zero priorities
+        - 1e-8: Good default value
+        - Too small: may cause numerical issues
+        - Too large: reduces prioritization effect
+
     Keyword Args:
         num_slices (int): the number of slices to be sampled. The batch-size
             must be greater or equal to the ``num_slices`` argument. Exclusive
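For readers unfamiliar with the class, a hedged usage sketch follows. The constructor arguments mirror the docstring above; the `traj_key`/`num_slices` keywords and the `update_priority` call follow the SliceSampler and PrioritizedSampler conventions referenced earlier, so treat this as a sketch rather than a verified snippet.

import torch
from tensordict import TensorDict
from torchrl.data.replay_buffers import LazyTensorStorage, PrioritizedSliceSampler, ReplayBuffer

sampler = PrioritizedSliceSampler(
    max_capacity=1000, alpha=0.6, beta=0.4, num_slices=4, traj_key="episode"
)
rb = ReplayBuffer(storage=LazyTensorStorage(1000), sampler=sampler, batch_size=32)

data = TensorDict(
    observation=torch.randn(100, 4),
    episode=torch.arange(100) // 25,  # four trajectories of length 25
    batch_size=[100],
)
rb.extend(data)
sample, info = rb.sample(return_info=True)
# after computing TD errors for the sampled transitions, feed them back as priorities
rb.update_priority(info["index"], torch.rand(32) + 1e-8)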
torchrl/data/replay_buffers/storages.py
CHANGED
@@ -230,15 +230,38 @@ class ListStorage(Storage):
         max_size (int, optional): the maximum number of elements stored in the storage.
             If not provided, an unlimited storage is created.

+    Keyword Args:
+        compilable (bool, optional): if ``True``, the storage will be made compatible with :func:`~torch.compile` at
+            the cost of being executable in multiprocessed settings.
+        device (str, optional): the device to use for the storage. Defaults to `None` (inputs are not moved to the device).
+
     """

     _default_checkpointer = ListStorageCheckpointer

-    def __init__(
+    def __init__(
+        self,
+        max_size: int | None = None,
+        *,
+        compilable: bool = False,
+        device: torch.device | str | int | None = None,
+    ):
         if max_size is None:
             max_size = torch.iinfo(torch.int64).max
         super().__init__(max_size, compilable=compilable)
         self._storage = []
+        self.device = device
+
+    def _to_device(self, data: Any) -> Any:
+        """Utility method to move data to the device."""
+        if self.device is not None:
+            if hasattr(data, "to"):
+                data = data.to(self.device)
+            else:
+                data = tree_map(
+                    lambda x: x.to(self.device) if hasattr(x, "to") else x, data
+                )
+        return data

     def set(
         self,
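The `_to_device` helper above falls back to `tree_map` when the payload has no `.to` method of its own. A minimal self-contained sketch of that pattern, using PyTorch's pytree utility on a made-up nested container:

import torch
from torch.utils._pytree import tree_map

nested = {"obs": torch.zeros(3), "info": {"step": torch.tensor(0), "note": "not a tensor"}}
moved = tree_map(lambda x: x.to("cpu") if hasattr(x, "to") else x, nested)
# tensor leaves are moved to the target device; other leaves pass through untouched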
@@ -254,6 +277,7 @@ class ListStorage(Storage):
             self.set(int(cursor), data, set_cursor=set_cursor)
             return
         if isinstance(cursor, slice):
+            data = self._to_device(data)
             self._storage[cursor] = data
             return
         if isinstance(
@@ -290,6 +314,7 @@ class ListStorage(Storage):
                 f"maximum capacity is {self.max_size} "
                 f"and the index of the item to be set is {cursor}."
             )
+        data = self._to_device(data)
         if cursor == len(self._storage):
             self._storage.append(data)
         else:
@@ -387,6 +412,7 @@ class LazyStackStorage(ListStorage):
         compilable (bool, optional): if ``True``, the storage will be made compatible with :func:`~torch.compile` at
             the cost of being executable in multiprocessed settings.
         stack_dim (int, optional): the stack dimension in terms of TensorDict batch sizes. Defaults to `0`.
+        device (str, optional): the device to use for the storage. Defaults to `None` (inputs are not moved to the device).

     Examples:
         >>> import torch
@@ -421,8 +447,9 @@ class LazyStackStorage(ListStorage):
         *,
         compilable: bool = False,
         stack_dim: int = 0,
+        device: torch.device | str | int | None = None,
     ):
-        super().__init__(max_size=max_size, compilable=compilable)
+        super().__init__(max_size=max_size, compilable=compilable, device=device)
         self.stack_dim = stack_dim

     def get(self, index: int | Sequence[int] | slice) -> Any:
torchrl/envs/llm/__init__.py
CHANGED
@@ -22,12 +22,14 @@ from .transforms import (
     KLRewardTransform,
     MCPToolTransform,
     PythonInterpreter,
+    RetrieveLogProb,
     TemplateTransform,
     Tokenizer,
 )

 __all__ = [
     "BrowserTransform",
+    "RetrieveLogProb",
     "ChatEnv",
     "DataLoadingPrimer",
     "DatasetChatEnv",
torchrl/envs/llm/chat.py
CHANGED
@@ -206,7 +206,10 @@ class ChatEnv(EnvBase):
             if lh.role != self.policy_role:
                 raise ValueError(
                     "The role received in the last block parsed from the policy "
-                    f"output does not match the expected policy role: received {lh.role} but expected {self.policy_role}"
+                    f"output does not match the expected policy role: received {lh.role} but expected {self.policy_role}.\n"
+                    f"Parsed input: {text=}\n"
+                    f"Parsed history: {parsed_history=}\n"
+                    f"Final element: {local_history=}"
                 )
             # Append history item
             history = history.append(local_history, inplace=False)
torchrl/envs/llm/reward/gsm8k.py
CHANGED
@@ -145,25 +145,32 @@ class GSM8KRewardParser(Transform):
             potential_answer = [potential_answer]
         if isinstance(cot, str):
             cot = [cot]
-        reward_answer = 5.0 * (len(potential_answer) == 1)

+        # Format quality rewards (always applied)
+        reward_answer = 5.0 * (len(potential_answer) == 1)
         reward_think = 5.0 * (len(cot) == 1)

-        #
+        # Answer correctness rewards
         reward_right = 20.0 * (
             any(attempt == true_answer for attempt in potential_answer)
         )
-
-        # One of the answer tags contains the right answer (might be e.g. $20 instead of 20)
         reward_contained = 10.0 * (
             any((true_answer in attempt) for attempt in potential_answer)
         )

         success = len(potential_answer) > 0 and potential_answer[-1] == true_answer
-
-
-
-
+
+        # Base success reward (lower than before to make format quality more important)
+        base_success_reward = 60.0 if success else 0.0
+
+        # Compose the rewards - always include format quality, even when successful
+        reward = (
+            base_success_reward
+            + reward_answer
+            + reward_think
+            + reward_contained
+            + reward_right
+        )

         rewards = TensorDict(
             reward_answer=reward_answer,
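The reworked composition makes the reward additive instead of all-or-nothing: a correct, well-formatted completion now earns 60 + 5 + 5 + 10 + 20 = 100, while a correct answer with sloppy tags still collects the correctness terms. A small arithmetic sketch of the same composition (coefficients copied from the hunk above, function name invented for illustration):

def gsm8k_reward(one_answer_tag, one_think_tag, contained, exact, final_correct):
    reward = 60.0 if final_correct else 0.0                  # base success reward
    reward += 5.0 * one_answer_tag + 5.0 * one_think_tag     # format quality
    reward += 10.0 * contained + 20.0 * exact                # answer correctness
    return reward

print(gsm8k_reward(True, True, True, True, True))    # 100.0
print(gsm8k_reward(False, True, True, True, False))  # 35.0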
torchrl/envs/llm/transforms/__init__.py
CHANGED
@@ -6,7 +6,7 @@
 from .browser import BrowserTransform
 from .dataloading import as_nested_tensor, as_padded_tensor, DataLoadingPrimer
 from .format import TemplateTransform
-from .kl import KLRewardTransform
+from .kl import KLRewardTransform, RetrieveLogProb
 from .policy_version import PolicyVersion
 from .tokenizer import Tokenizer
 from .tools import MCPToolTransform, PythonInterpreter
@@ -15,6 +15,7 @@ __all__ = [
     "BrowserTransform",
     "DataLoadingPrimer",
     "KLRewardTransform",
+    "RetrieveLogProb",
     "MCPToolTransform",
     "PolicyVersion",
     "PythonInterpreter",
torchrl/envs/llm/transforms/kl.py
CHANGED
@@ -4,15 +4,25 @@
 # LICENSE file in the root directory of this source tree.
 from __future__ import annotations

+import contextlib
+import gc
+
 from copy import copy

 import torch
-from tensordict import NestedKey, TensorDictBase, unravel_key
+from tensordict import NestedKey, set_list_to_stack, TensorDictBase, unravel_key
 from tensordict.nn import ProbabilisticTensorDictModule
-from tensordict.utils import is_seq_of_nested_key
+from tensordict.utils import _zip_strict, is_seq_of_nested_key
 from torchrl.data import Composite, Unbounded
+from torchrl.data.llm.chat import History
 from torchrl.envs import EnvBase, Transform
 from torchrl.envs.transforms.utils import _set_missing_tolerance
+from torchrl.modules.llm.policies.common import CategoricalSequential
+
+try:
+    import transformers
+except ImportError:
+    transformers = None


 class KLRewardTransform(Transform):
@@ -141,8 +151,8 @@ class KLRewardTransform(Transform):
                 f"action_key is required. Please set a parent for the {type(self).__name__} to recover the action keys automatically, "
                 f"or pass the action_key argument directly to {type(self).__name__} constructor."
             )
-
-        if
+        response_txt = tensordict.get(action_key, None)
+        if response_txt is None:
             if not self.missing_tolerance:
                 raise RuntimeError(
                     f"Action with key {action_key} not found data {tensordict}"
@@ -269,3 +279,229 @@ class KLRewardTransform(Transform):
         observation_spec[self.out_keys[1]] = reward_spec.clone()

         return output_spec
+
+
+class RetrieveLogProb(Transform):
+    """A transform to retrieve the log-probs of a text given a reference model.
+
+    Args:
+        actor (CategoricalSequential): the reference model.
+
+    Keyword Args:
+        history_key (NestedKey): the key where the history is stored. Defaults to `"history"`.
+        log_prob_key (NestedKey): the key where the log-probs are stored. Defaults to `"ref_log_prob"`.
+        assistant_only (bool): whether to only retrieve the log-probs of the assistant tokens (i.e., steps of history
+            where the role is `"assistant"`). Defaults to `False`.
+
+            .. note:: The template must accommodate the `return_assistant_tokens_mask` keyword argument.
+                This may not be the case for all templates. In this case, you can pass a custom template to the `apply_chat_template` method
+                via the `tokenizer_kwargs` argument: `tokenizer_kwargs = {"chat_template_name": "qwen"}` or `tokenizer_kwargs = {"chat_template": my_template}.
+
+        tokenizer_kwargs (dict): the keyword arguments to pass to the tokenizer to be used to apply the chat template to the history when `assistant_only` is `True`.
+            To control the tokenization in the actor, pass the tokenizer kwargs to the actor constructor.
+            Defaults to `{"return_assistant_tokens_mask": True, "tokenize": True, "return_tensors": "pt", "padding": True, "add_generation_prompt": False}`.
+        tokenizer (transformers.AutoTokenizer): the tokenizer to be used to tokenize the input and compute the assitant mask. If not provided, the tokenizer will be inferred from the `actor`.
+        detach (bool): whether to exclude the log-probs from the gradient computation. Defaults to `True`.
+        device (torch.device): the device to use for tensor creation. Defaults to `None`.
+
+    Examples:
+        >>> from torchrl.data.llm.chat import History, _CHAT_TEMPLATES
+        >>> from torchrl.modules.llm import TransformersWrapper
+        >>> from torchrl.objectives.llm.sft import SFTLoss
+        >>> from transformers import AutoTokenizer, OPTConfig, OPTForCausalLM
+        >>> from tensordict import TensorDict, lazy_stack, set_list_to_stack
+        >>> import torch
+        >>>
+        >>> set_list_to_stack(True).set()
+        >>>
+        >>> # Create chat data
+        >>> chats = [
+        ...     [
+        ...         {"role": "system", "content": "You are a helpful assistant."},
+        ...         {"role": "user", "content": "Hello, how are you?"},
+        ...         {"role": "assistant", "content": "I'm doing well, thank you!"},
+        ...     ],
+        ...     [
+        ...         {"role": "system", "content": "You are a helpful assistant."},
+        ...         {"role": "user", "content": "What's the weather like?"},
+        ...         {"role": "assistant", "content": "I can't check the weather for you."},
+        ...     ],
+        ... ]
+        >>> history = History.from_chats(chats)
+        >>> print(f"Created history with shape: {history.shape}")
+        Created history with shape: torch.Size([2, 3])
+        >>>
+        >>> # Setup tokenizer and model
+        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+        >>> tokenizer.pad_token = tokenizer.eos_token
+        >>> tokenizer.chat_template = _CHAT_TEMPLATES["chatml_format"]
+        >>> model = OPTForCausalLM(OPTConfig()).eval()
+        >>>
+        >>> # Create training and reference policies
+        >>> policy_train = TransformersWrapper(
+        ...     model,
+        ...     tokenizer=tokenizer,
+        ...     generate=False,
+        ...     from_text=True,
+        ...     chat_template_name="qwen",
+        ... )
+        >>> policy_ref = TransformersWrapper(
+        ...     model,
+        ...     tokenizer=tokenizer,
+        ...     generate=False,
+        ...     from_text=True,
+        ...     return_log_probs=True,
+        ...     chat_template_name="qwen",
+        ... )
+        >>>
+        >>> # Create the RetrieveLogProb transform
+        >>> transform = RetrieveLogProb(
+        ...     policy_ref,
+        ...     assistant_only=True,
+        ...     tokenizer_kwargs={"chat_template_name": "qwen"},
+        ...     tokenizer=tokenizer,
+        ... )
+        >>>
+        >>> # Prepare data
+        >>> text = history[:, :-1].apply_chat_template(
+        ...     tokenizer=tokenizer, chat_template_name="qwen", add_generation_prompt=True
+        ... )
+        >>> text_response = history.apply_chat_template(
+        ...     tokenizer=tokenizer, chat_template_name="qwen", add_generation_prompt=False
+        ... )
+        >>> text_response = [
+        ...     txt[len(txt_start):] for txt, txt_start in zip(text_response, text)
+        ... ]
+        >>> td = TensorDict(
+        ...     text=text,
+        ...     text_response=text_response,
+        ...     history=history,
+        ...     next=TensorDict(
+        ...         reward=torch.randn(2, 1),
+        ...         done=torch.zeros(2, dtype=torch.bool),
+        ...         history=history,
+        ...     ),
+        ...     batch_size=(2,),
+        ... )
+        >>> data = lazy_stack(list(td.unbind(0)))
+        >>>
+        >>> # Apply the transform to get reference log probabilities
+        >>> data = transform(data)
+        >>> # You can get a padded tensor for batching:
+        >>> ref_log_probs = data.get(("next", "ref_log_prob"), as_padded_tensor=True)
+        >>> print(f"Type: {type(ref_log_probs)}, Length: {len(ref_log_probs)}")
+        Type: <class 'torch.Tensor'>, Length: 2
+        >>> print(f"Example shapes: {[x.shape for x in ref_log_probs]}")
+        Example shapes: [torch.Size([35]), torch.Size([35])]
+        >>> print(ref_log_probs.shape)  # (batch, max_seq_len)
+        torch.Size([2, 35])
+        >>>
+        >>> # Use with SFTLoss for KL regularization
+        >>> loss = SFTLoss(
+        ...     actor_network=policy_train,
+        ...     tokenizer=tokenizer,
+        ...     reduction="mean",
+        ...     normalize_by_seq_length=True,
+        ...     kl_to_ref_coeff=0.1,
+        ...     tokenizer_kwargs={"chat_template_name": "qwen"},
+        ... )
+        >>> loss_vals = loss(data)
+        >>> print(f"SFT Loss: {loss_vals.loss_sft.item():.4f}")
+        SFT Loss: 10.7856
+        >>> print(f"KL to Reference Loss: {loss_vals.loss_kl_to_ref.item():.4f}")
+        KL to Reference Loss: 0.0000
+        >>> print(f"Total Loss: {loss_vals.sum(reduce=True).item():.4f}")
+        Total Loss: 10.7856
+
+    Note:
+        By default, the log-probabilities are stored as a list of tensors (one per sample, with variable length).
+        Use `as_padded_tensor=True` in `.get()` to obtain a batchable tensor (with padding).
+        The reference log probabilities are computed only for assistant tokens when `assistant_only=True`.
+
+    """
+
+    def __init__(
+        self,
+        actor: CategoricalSequential,
+        *,
+        history_key: NestedKey | None = None,
+        log_prob_key: NestedKey = "ref_log_prob",
+        assistant_only: bool = False,
+        tokenizer_kwargs: dict | None = None,
+        detach: bool = True,
+        device: torch.device | None = None,
+        tokenizer: transformers.AutoTokenizer | None = None,
+    ):
+        if history_key is None:
+            history_key = "history"
+        self.history_key = history_key
+        self.log_prob_key = log_prob_key
+        super().__init__(in_keys=[history_key], out_keys=[log_prob_key])
+        self.actor = actor
+        if not getattr(actor, "return_log_probs", True):
+            raise ValueError(
+                "The actor must have `return_log_probs=True` to use the `AssistantLogProb` transform."
+            )
+        if getattr(actor, "generate", True):
+            raise ValueError(
+                "The actor must have `generate=False` to use the `AssistantLogProb` transform."
+            )
+        if not getattr(actor, "from_text", False):
+            raise ValueError(
+                "The actor must have `from_text=True` to use the `AssistantLogProb` transform. If `from_text=False` is required, please file an issue on GitHub."
+            )
+        # if getattr(self.actor, "tokenizer_kwargs", {}).get("add_generation_prompt", True):
+        #     raise ValueError("The actor must have `tokenizer_kwargs['add_generation_prompt']=False` to use the `AssistantLogProb` transform.")
+        self.assistant_only = assistant_only
+        if tokenizer_kwargs is None:
+            tokenizer_kwargs = {}
+        tokenizer_kwargs.setdefault("return_assistant_tokens_mask", True)
+        tokenizer_kwargs.setdefault("tokenize", True)
+        tokenizer_kwargs.setdefault("return_tensors", "pt")
+        tokenizer_kwargs.setdefault("padding", False)
+        tokenizer_kwargs.setdefault("add_generation_prompt", False)
+        self.tokenizer_kwargs = tokenizer_kwargs
+        self.tokenizer = tokenizer
+        self.detach = detach
+        self.device = device
+
+    def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
+        next_td = self._step(tensordict, tensordict.get("next"))
+        return tensordict.set("next", next_td)
+
+    @set_list_to_stack(True)
+    def _step(
+        self, tensordict: TensorDictBase, next_tensordict: TensorDictBase
+    ) -> TensorDictBase:
+        td = next_tensordict.select(self.history_key)
+        with torch.device(
+            self.device
+        ) if self.device is not None else contextlib.nullcontext(), torch.no_grad() if self.detach else contextlib.nullcontext():
+            result = self.actor(td.select(self.history_key))
+            td.update(result.select(getattr(self.actor, "log_prob_key", "log_probs")))
+            td.rename_key_(
+                getattr(self.actor, "log_prob_key", "log_probs"), self.log_prob_key
+            )
+        if torch.cuda.is_available():
+            gc.collect()
+            torch.cuda.empty_cache()
+        if self.assistant_only:
+            with torch.device(
+                self.device
+            ) if self.device is not None else contextlib.nullcontext():
+                # Get assistant mask
+                history: History = td.get(self.history_key)
+                proc = history.apply_chat_template(
+                    tokenizer=self.actor.tokenizer
+                    if self.tokenizer is None
+                    else self.tokenizer,
+                    **self.tokenizer_kwargs,
+                )
+                assistant_masks = proc.get("assistant_masks", as_list=True)
+                log_probs = td.get(self.log_prob_key, as_list=True)
+                log_probs = [
+                    lp[mask.bool()]
+                    for lp, mask in _zip_strict(log_probs, assistant_masks)
+                ]
+                td = td.set(self.log_prob_key, log_probs)
+        return next_tensordict.update(td)
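The reference log-probs retrieved by this transform are typically consumed as a per-token KL penalty against the current policy. A self-contained sketch of that arithmetic (the tensor values are made up, and the 0.1 coefficient mirrors the `kl_to_ref_coeff` idea from the docstring example):

import torch

log_probs_policy = torch.tensor([-0.9, -1.2, -0.4])  # current policy, per token
log_probs_ref = torch.tensor([-1.0, -1.5, -0.5])     # frozen reference, per token

kl_per_token = log_probs_policy - log_probs_ref  # Monte-Carlo estimate of KL(pi || pi_ref)
kl_penalty = 0.1 * kl_per_token.sum()
print(kl_penalty)  # tensor(0.0500)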