torchrl_nightly-2025.6.19-cp39-cp39-win_amd64.whl → torchrl_nightly-2025.6.21-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. torchrl/_torchrl.cp39-win_amd64.pyd +0 -0
  2. torchrl/collectors/collectors.py +49 -24
  3. torchrl/collectors/llm/base.py +13 -6
  4. torchrl/collectors/llm/ray_collector.py +3 -0
  5. torchrl/data/__init__.py +2 -0
  6. torchrl/data/datasets/minari_data.py +1 -1
  7. torchrl/data/llm/__init__.py +2 -0
  8. torchrl/data/llm/chat.py +59 -9
  9. torchrl/data/llm/topk.py +186 -0
  10. torchrl/data/replay_buffers/ray_buffer.py +15 -1
  11. torchrl/data/replay_buffers/replay_buffers.py +50 -11
  12. torchrl/data/replay_buffers/samplers.py +98 -21
  13. torchrl/data/replay_buffers/storages.py +29 -2
  14. torchrl/envs/llm/__init__.py +2 -0
  15. torchrl/envs/llm/chat.py +4 -1
  16. torchrl/envs/llm/reward/gsm8k.py +15 -8
  17. torchrl/envs/llm/transforms/__init__.py +2 -1
  18. torchrl/envs/llm/transforms/kl.py +240 -4
  19. torchrl/envs/transforms/transforms.py +11 -27
  20. torchrl/modules/llm/policies/transformers_wrapper.py +71 -15
  21. torchrl/modules/llm/policies/vllm_wrapper.py +38 -5
  22. torchrl/objectives/llm/__init__.py +2 -1
  23. torchrl/objectives/llm/sft.py +465 -0
  24. torchrl/objectives/ppo.py +35 -12
  25. torchrl/version.py +2 -2
  26. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/METADATA +1 -1
  27. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/RECORD +30 -28
  28. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/LICENSE +0 -0
  29. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/WHEEL +0 -0
  30. {torchrl_nightly-2025.6.19.dist-info → torchrl_nightly-2025.6.21.dist-info}/top_level.txt +0 -0
torchrl/envs/llm/transforms/kl.py
@@ -4,15 +4,25 @@
  # LICENSE file in the root directory of this source tree.
  from __future__ import annotations
 
+ import contextlib
+ import gc
+
  from copy import copy
 
  import torch
- from tensordict import NestedKey, TensorDictBase, unravel_key
+ from tensordict import NestedKey, set_list_to_stack, TensorDictBase, unravel_key
  from tensordict.nn import ProbabilisticTensorDictModule
- from tensordict.utils import is_seq_of_nested_key
+ from tensordict.utils import _zip_strict, is_seq_of_nested_key
  from torchrl.data import Composite, Unbounded
+ from torchrl.data.llm.chat import History
  from torchrl.envs import EnvBase, Transform
  from torchrl.envs.transforms.utils import _set_missing_tolerance
+ from torchrl.modules.llm.policies.common import CategoricalSequential
+
+ try:
+     import transformers
+ except ImportError:
+     transformers = None
 
 
  class KLRewardTransform(Transform):
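
The guarded `transformers` import above is the usual optional-dependency pattern: the module imports cleanly even without `transformers` installed, and code that needs it can check for `None` at call time. A minimal sketch of the pattern (`require_transformers` is a hypothetical helper, not part of torchrl):

    try:
        import transformers
    except ImportError:
        transformers = None  # dependent features check this at call time

    def require_transformers() -> None:
        # Fail lazily, at call time, rather than at import time.
        if transformers is None:
            raise ImportError("this feature requires the `transformers` package")
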
@@ -141,8 +151,8 @@ class KLRewardTransform(Transform):
                  f"action_key is required. Please set a parent for the {type(self).__name__} to recover the action keys automatically, "
                  f"or pass the action_key argument directly to {type(self).__name__} constructor."
              )
-         action = tensordict.get(action_key, None)
-         if action is None:
+         response_txt = tensordict.get(action_key, None)
+         if response_txt is None:
              if not self.missing_tolerance:
                  raise RuntimeError(
                      f"Action with key {action_key} not found data {tensordict}"
@@ -269,3 +279,229 @@ class KLRewardTransform(Transform):
          observation_spec[self.out_keys[1]] = reward_spec.clone()
 
          return output_spec
+
+
+ class RetrieveLogProb(Transform):
+     """A transform to retrieve the log-probs of a text given a reference model.
+
+     Args:
+         actor (CategoricalSequential): the reference model.
+
+     Keyword Args:
+         history_key (NestedKey): the key where the history is stored. Defaults to `"history"`.
+         log_prob_key (NestedKey): the key where the log-probs are stored. Defaults to `"ref_log_prob"`.
+         assistant_only (bool): whether to only retrieve the log-probs of the assistant tokens (i.e., steps of history
+             where the role is `"assistant"`). Defaults to `False`.
+
+             .. note:: The template must accommodate the `return_assistant_tokens_mask` keyword argument.
+                 This may not be the case for all templates. In this case, you can pass a custom template to the
+                 `apply_chat_template` method via the `tokenizer_kwargs` argument:
+                 `tokenizer_kwargs = {"chat_template_name": "qwen"}` or `tokenizer_kwargs = {"chat_template": my_template}`.
+
+         tokenizer_kwargs (dict): the keyword arguments to pass to the tokenizer to be used to apply the chat template to the history when `assistant_only` is `True`.
+             To control the tokenization in the actor, pass the tokenizer kwargs to the actor constructor.
+             Defaults to `{"return_assistant_tokens_mask": True, "tokenize": True, "return_tensors": "pt", "padding": True, "add_generation_prompt": False}`.
+         tokenizer (transformers.AutoTokenizer): the tokenizer to be used to tokenize the input and compute the assistant mask. If not provided, the tokenizer will be inferred from the `actor`.
+         detach (bool): whether to exclude the log-probs from the gradient computation. Defaults to `True`.
+         device (torch.device): the device to use for tensor creation. Defaults to `None`.
+
+     Examples:
+         >>> from torchrl.data.llm.chat import History, _CHAT_TEMPLATES
+         >>> from torchrl.modules.llm import TransformersWrapper
+         >>> from torchrl.objectives.llm.sft import SFTLoss
+         >>> from transformers import AutoTokenizer, OPTConfig, OPTForCausalLM
+         >>> from tensordict import TensorDict, lazy_stack, set_list_to_stack
+         >>> import torch
+         >>>
+         >>> set_list_to_stack(True).set()
+         >>>
+         >>> # Create chat data
+         >>> chats = [
+         ...     [
+         ...         {"role": "system", "content": "You are a helpful assistant."},
+         ...         {"role": "user", "content": "Hello, how are you?"},
+         ...         {"role": "assistant", "content": "I'm doing well, thank you!"},
+         ...     ],
+         ...     [
+         ...         {"role": "system", "content": "You are a helpful assistant."},
+         ...         {"role": "user", "content": "What's the weather like?"},
+         ...         {"role": "assistant", "content": "I can't check the weather for you."},
+         ...     ],
+         ... ]
+         >>> history = History.from_chats(chats)
+         >>> print(f"Created history with shape: {history.shape}")
+         Created history with shape: torch.Size([2, 3])
+         >>>
+         >>> # Setup tokenizer and model
+         >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+         >>> tokenizer.pad_token = tokenizer.eos_token
+         >>> tokenizer.chat_template = _CHAT_TEMPLATES["chatml_format"]
+         >>> model = OPTForCausalLM(OPTConfig()).eval()
+         >>>
+         >>> # Create training and reference policies
+         >>> policy_train = TransformersWrapper(
+         ...     model,
+         ...     tokenizer=tokenizer,
+         ...     generate=False,
+         ...     from_text=True,
+         ...     chat_template_name="qwen",
+         ... )
+         >>> policy_ref = TransformersWrapper(
+         ...     model,
+         ...     tokenizer=tokenizer,
+         ...     generate=False,
+         ...     from_text=True,
+         ...     return_log_probs=True,
+         ...     chat_template_name="qwen",
+         ... )
+         >>>
+         >>> # Create the RetrieveLogProb transform
+         >>> transform = RetrieveLogProb(
+         ...     policy_ref,
+         ...     assistant_only=True,
+         ...     tokenizer_kwargs={"chat_template_name": "qwen"},
+         ...     tokenizer=tokenizer,
+         ... )
+         >>>
+         >>> # Prepare data
+         >>> text = history[:, :-1].apply_chat_template(
+         ...     tokenizer=tokenizer, chat_template_name="qwen", add_generation_prompt=True
+         ... )
+         >>> text_response = history.apply_chat_template(
+         ...     tokenizer=tokenizer, chat_template_name="qwen", add_generation_prompt=False
+         ... )
+         >>> text_response = [
+         ...     txt[len(txt_start):] for txt, txt_start in zip(text_response, text)
+         ... ]
+         >>> td = TensorDict(
+         ...     text=text,
+         ...     text_response=text_response,
+         ...     history=history,
+         ...     next=TensorDict(
+         ...         reward=torch.randn(2, 1),
+         ...         done=torch.zeros(2, dtype=torch.bool),
+         ...         history=history,
+         ...     ),
+         ...     batch_size=(2,),
+         ... )
+         >>> data = lazy_stack(list(td.unbind(0)))
+         >>>
+         >>> # Apply the transform to get reference log probabilities
+         >>> data = transform(data)
+         >>> # You can get a padded tensor for batching:
+         >>> ref_log_probs = data.get(("next", "ref_log_prob"), as_padded_tensor=True)
+         >>> print(f"Type: {type(ref_log_probs)}, Length: {len(ref_log_probs)}")
+         Type: <class 'torch.Tensor'>, Length: 2
+         >>> print(f"Example shapes: {[x.shape for x in ref_log_probs]}")
+         Example shapes: [torch.Size([35]), torch.Size([35])]
+         >>> print(ref_log_probs.shape)  # (batch, max_seq_len)
+         torch.Size([2, 35])
+         >>>
+         >>> # Use with SFTLoss for KL regularization
+         >>> loss = SFTLoss(
+         ...     actor_network=policy_train,
+         ...     tokenizer=tokenizer,
+         ...     reduction="mean",
+         ...     normalize_by_seq_length=True,
+         ...     kl_to_ref_coeff=0.1,
+         ...     tokenizer_kwargs={"chat_template_name": "qwen"},
+         ... )
+         >>> loss_vals = loss(data)
+         >>> print(f"SFT Loss: {loss_vals.loss_sft.item():.4f}")
+         SFT Loss: 10.7856
+         >>> print(f"KL to Reference Loss: {loss_vals.loss_kl_to_ref.item():.4f}")
+         KL to Reference Loss: 0.0000
+         >>> print(f"Total Loss: {loss_vals.sum(reduce=True).item():.4f}")
+         Total Loss: 10.7856
+
+     Note:
+         By default, the log-probabilities are stored as a list of tensors (one per sample, with variable length).
+         Use `as_padded_tensor=True` in `.get()` to obtain a batchable tensor (with padding).
+         The reference log probabilities are computed only for assistant tokens when `assistant_only=True`.
+
+     """
+
+     def __init__(
+         self,
+         actor: CategoricalSequential,
+         *,
+         history_key: NestedKey | None = None,
+         log_prob_key: NestedKey = "ref_log_prob",
+         assistant_only: bool = False,
+         tokenizer_kwargs: dict | None = None,
+         detach: bool = True,
+         device: torch.device | None = None,
+         tokenizer: transformers.AutoTokenizer | None = None,
+     ):
+         if history_key is None:
+             history_key = "history"
+         self.history_key = history_key
+         self.log_prob_key = log_prob_key
+         super().__init__(in_keys=[history_key], out_keys=[log_prob_key])
+         self.actor = actor
+         if not getattr(actor, "return_log_probs", True):
+             raise ValueError(
+                 "The actor must have `return_log_probs=True` to use the `AssistantLogProb` transform."
+             )
+         if getattr(actor, "generate", True):
+             raise ValueError(
+                 "The actor must have `generate=False` to use the `AssistantLogProb` transform."
+             )
+         if not getattr(actor, "from_text", False):
+             raise ValueError(
+                 "The actor must have `from_text=True` to use the `AssistantLogProb` transform. If `from_text=False` is required, please file an issue on GitHub."
+             )
+         # if getattr(self.actor, "tokenizer_kwargs", {}).get("add_generation_prompt", True):
+         #     raise ValueError("The actor must have `tokenizer_kwargs['add_generation_prompt']=False` to use the `AssistantLogProb` transform.")
+         self.assistant_only = assistant_only
+         if tokenizer_kwargs is None:
+             tokenizer_kwargs = {}
+         tokenizer_kwargs.setdefault("return_assistant_tokens_mask", True)
+         tokenizer_kwargs.setdefault("tokenize", True)
+         tokenizer_kwargs.setdefault("return_tensors", "pt")
+         tokenizer_kwargs.setdefault("padding", False)
+         tokenizer_kwargs.setdefault("add_generation_prompt", False)
+         self.tokenizer_kwargs = tokenizer_kwargs
+         self.tokenizer = tokenizer
+         self.detach = detach
+         self.device = device
+
+     def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
+         next_td = self._step(tensordict, tensordict.get("next"))
+         return tensordict.set("next", next_td)
+
+     @set_list_to_stack(True)
+     def _step(
+         self, tensordict: TensorDictBase, next_tensordict: TensorDictBase
+     ) -> TensorDictBase:
+         td = next_tensordict.select(self.history_key)
+         with torch.device(
+             self.device
+         ) if self.device is not None else contextlib.nullcontext(), torch.no_grad() if self.detach else contextlib.nullcontext():
+             result = self.actor(td.select(self.history_key))
+             td.update(result.select(getattr(self.actor, "log_prob_key", "log_probs")))
+             td.rename_key_(
+                 getattr(self.actor, "log_prob_key", "log_probs"), self.log_prob_key
+             )
+         if torch.cuda.is_available():
+             gc.collect()
+             torch.cuda.empty_cache()
+         if self.assistant_only:
+             with torch.device(
+                 self.device
+             ) if self.device is not None else contextlib.nullcontext():
+                 # Get assistant mask
+                 history: History = td.get(self.history_key)
+                 proc = history.apply_chat_template(
+                     tokenizer=self.actor.tokenizer
+                     if self.tokenizer is None
+                     else self.tokenizer,
+                     **self.tokenizer_kwargs,
+                 )
+                 assistant_masks = proc.get("assistant_masks", as_list=True)
+                 log_probs = td.get(self.log_prob_key, as_list=True)
+                 log_probs = [
+                     lp[mask.bool()]
+                     for lp, mask in _zip_strict(log_probs, assistant_masks)
+                 ]
+                 td = td.set(self.log_prob_key, log_probs)
+         return next_tensordict.update(td)
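
The heart of `_step` when `assistant_only=True` is the per-sample mask filtering at the end of the hunk. A self-contained sketch of that step, with illustrative tensor sizes (`filter_assistant_log_probs` is a hypothetical name):

    import torch

    def filter_assistant_log_probs(log_probs, assistant_masks):
        # Keep only the log-probs at positions flagged as assistant tokens,
        # returning one variable-length tensor per sample.
        if len(log_probs) != len(assistant_masks):
            raise ValueError("expected one mask per log-prob tensor")
        return [lp[mask.bool()] for lp, mask in zip(log_probs, assistant_masks)]

    log_probs = [torch.randn(5), torch.randn(7)]
    masks = [torch.tensor([0, 0, 1, 1, 1]), torch.tensor([0, 1, 1, 1, 1, 0, 0])]
    print([t.shape for t in filter_assistant_log_probs(log_probs, masks)])
    # [torch.Size([3]), torch.Size([4])]
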
torchrl/envs/transforms/transforms.py
@@ -8726,8 +8726,10 @@ class Reward2GoTransform(Transform):
  class ActionMask(Transform):
      """An adaptive action masker.
 
-     This transform reads the mask from the input tensordict after the step is executed,
-     and adapts the mask of the one-hot / categorical action spec.
+     This transform is useful to ensure that randomly generated actions
+     respect legal actions, by masking the action specs.
+     It reads the mask from the input tensordict after the step is executed,
+     and adapts the mask of the finite action spec.
 
      .. note:: This transform will fail when used without an environment.
 
@@ -8773,8 +8775,6 @@ class ActionMask(Transform):
      >>> base_env = MaskedEnv()
      >>> env = TransformedEnv(base_env, ActionMask())
      >>> r = env.rollout(10)
-     >>> env = TransformedEnv(base_env, ActionMask())
-     >>> r = env.rollout(10)
      >>> r["action_mask"]
      tensor([[ True, True, True, True],
              [ True, True, False, True],
@@ -8810,15 +8810,8 @@ class ActionMask(Transform):
          raise RuntimeError(FORWARD_NOT_IMPLEMENTED.format(type(self)))
 
      @property
-     def action_spec(self):
-         action_spec = self.container.full_action_spec
-         keys = self.container.action_keys
-         if len(keys) == 1:
-             action_spec = action_spec[keys[0]]
-         else:
-             raise ValueError(
-                 f"Too many action keys for {self.__class__.__name__}: {keys=}"
-             )
+     def action_spec(self) -> TensorSpec:
+         action_spec = self.container.full_action_spec[self.in_keys[0]]
          if not isinstance(action_spec, self.ACCEPTED_SPECS):
              raise ValueError(
                  self.SPEC_TYPE_ERROR.format(self.ACCEPTED_SPECS, type(action_spec))
@@ -8826,29 +8819,20 @@ class ActionMask(Transform):
          return action_spec
 
      def _call(self, next_tensordict: TensorDictBase) -> TensorDictBase:
-         parent = self.parent
-         if parent is None:
+         if self.parent is None:
              raise RuntimeError(
                  f"{type(self)}.parent cannot be None: make sure this transform is executed within an environment."
              )
+
          mask = next_tensordict.get(self.in_keys[1])
-         action_spec = self.action_spec
-         action_spec.update_mask(mask.to(action_spec.device))
+         self.action_spec.update_mask(mask.to(self.action_spec.device))
+
          return next_tensordict
 
      def _reset(
          self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase
      ) -> TensorDictBase:
-         action_spec = self.action_spec
-         mask = tensordict.get(self.in_keys[1], None)
-         if mask is not None:
-             mask = mask.to(action_spec.device)
-         action_spec.update_mask(mask)
-
-         # TODO: Check that this makes sense
-         with _set_missing_tolerance(self, True):
-             tensordict_reset = self._call(tensordict_reset)
-         return tensordict_reset
+         return self._call(tensordict_reset)
 
 
  class VecGymEnvTransform(Transform):
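
The simplified `_call` and `_reset` above both reduce to a single `update_mask` call on the action spec. A minimal sketch of that mechanism, assuming the `OneHot` spec exported by `torchrl.data`:

    import torch
    from torchrl.data import OneHot

    spec = OneHot(4)
    spec.update_mask(torch.tensor([True, True, False, True]))  # action 2 is illegal
    sample = spec.rand()  # random sampling now respects the mask
    assert not sample[2].item()
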
torchrl/modules/llm/policies/transformers_wrapper.py
@@ -65,6 +65,10 @@ class TransformersWrapper(CategoricalSequential):
          operations. If `True`, operations will be performed in-place. If `False`, a new TensorDict instance will be
          created. If `"empty"`, the output data structure will be initialized with `input.empty()` (i.e., it will
          conserve type, batch-size, and device). Defaults to `True`.
+     chat_template_name (Literal["chatml_format", "qwen"] | None, optional): The name of the chat template to use when
+         applying the chat template to the history. Defaults to `None`.
+     chat_template (str | None, optional): The chat template to use when applying the chat template to the history.
+         Defaults to `None`.
 
      .. note:: The tokenizer is used when `from_text` is `True` to convert input text into token sequences. It is also
          required (or retrieved) when `pad_output` is `True` or when using text inputs with `generate=False` to ensure proper
@@ -131,6 +135,8 @@ class TransformersWrapper(CategoricalSequential):
          tokenizer_kwargs: dict | None = None,
          pad_output: bool = True,
          inplace: Literal[True, False, "empty"] | None = True,
+         chat_template_name: Literal["chatml_format", "qwen"] | None = None,
+         chat_template: str | None = None,
      ):
          super().__init__()
 
@@ -143,6 +149,8 @@ class TransformersWrapper(CategoricalSequential):
          self.inplace = inplace
          self.pad_output = pad_output
          padding_value = None
+         self.chat_template_name = chat_template_name
+         self.chat_template = chat_template
 
          if not tokenizer_kwargs:
              tokenizer_kwargs = {}
@@ -300,7 +308,17 @@ class TransformersWrapper(CategoricalSequential):
              raise ValueError(
                  "No text or history provided to the TransformersWrapper."
              )
-         text = history.apply_chat_template(self.tokenizer)
+         tokenizer_kwargs = {}
+         if self.chat_template_name is not None:
+             tokenizer_kwargs.setdefault(
+                 "chat_template_name", self.chat_template_name
+             )
+         if self.chat_template is not None:
+             tokenizer_kwargs.setdefault("chat_template", self.chat_template)
+         tokenizer_kwargs.setdefault("add_generation_prompt", False)
+         text = history.apply_chat_template(
+             tokenizer=self.tokenizer, **tokenizer_kwargs
+         )
          if not isinstance(text, (list, str)):
              text = text.tolist()
          tokens_in = self.tokenizer(text, **self.tokenizer_kwargs)
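
A hedged construction example for the new keyword arguments, mirroring the wrapper usage shown in the `RetrieveLogProb` docstring earlier in this diff (model name and template choice are illustrative):

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from torchrl.modules.llm import TransformersWrapper

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
    policy = TransformersWrapper(
        model,
        tokenizer=tokenizer,
        from_text=True,
        generate=False,
        return_log_probs=True,
        chat_template_name="qwen",  # forwarded to History.apply_chat_template
    )
    # With a `history` entry and no `text` input, the wrapper now applies the chat
    # template with chat_template_name="qwen" and add_generation_prompt=False.
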
@@ -325,7 +343,7 @@ class TransformersWrapper(CategoricalSequential):
          logits = torch.stack(list(tokens_out["logits"]), 1)
          logits = _unpad_tensors(logits, mask_sequences, as_nested=False)
          log_probs, logits = self._log_probs_generate(
-             sequences, logits, pad_val=pad_val
+             sequences, logits, pad_val=-100
          )
          response_text = self.tokenizer.batch_decode(
              sequences, skip_special_tokens=False
@@ -407,17 +425,36 @@ class TransformersWrapper(CategoricalSequential):
          pad_val = self.tokenizer.pad_token_id
 
          prompt_txt = td.get(self.text_key)
-         if prompt_txt is None:
+         response_txt = td.get(self.text_response_key)
+         if prompt_txt is None or response_txt is None:
+             if prompt_txt is not None and response_txt is not None:
+                 raise ValueError(
+                     "No text or history provided to the TransformersWrapper. Either both are provided or none of them."
+                 )
              # Fallback on history parsing
              history = td.get(self.history_key)
              if history is None:
                  raise ValueError(
                      "No text or history provided to the TransformersWrapper."
                  )
-             prompt_txt = history.apply_chat_template(self.tokenizer)
+             tokenizer_kwargs = {}
+             if self.chat_template_name is not None:
+                 tokenizer_kwargs.setdefault(
+                     "chat_template_name", self.chat_template_name
+                 )
+             if self.chat_template is not None:
+                 tokenizer_kwargs.setdefault("chat_template", self.chat_template)
+             tokenizer_kwargs.setdefault("add_generation_prompt", False)
+             response_txt = history.apply_chat_template(
+                 tokenizer=self.tokenizer, **tokenizer_kwargs
+             )
+             if isinstance(response_txt, list):
+                 prompt_txt = ["" for _ in response_txt]
+             else:
+                 prompt_txt = ""
+
          if not isinstance(prompt_txt, (list, str)):
              prompt_txt = prompt_txt.tolist()
-         response_txt = td.get(self.text_response_key)
          if not isinstance(response_txt, (list, str)):
              response_txt = response_txt.tolist()
          total_txt = [x + y for x, y in _zip_strict(prompt_txt, response_txt)]
@@ -450,6 +487,8 @@ class TransformersWrapper(CategoricalSequential):
          )
          sequences = [
              _total_input_ids[_prompt_input_ids.shape[-1] :]
+             if _prompt_input_ids.shape[-1] > 0
+             else _total_input_ids
              for _total_input_ids, _prompt_input_ids in zip(
                  total_input_ids, prompt_input_ids
              )
@@ -484,7 +523,7 @@ class TransformersWrapper(CategoricalSequential):
 
          total_input_ids = [
              torch.cat([_prompt_input_ids, _response_input_ids], -1)
-             for _prompt_input_ids, _response_input_ids in zip(
+             for _prompt_input_ids, _response_input_ids in _zip_strict(
                  prompt_input_ids, response_input_ids
              )
          ]
@@ -512,7 +551,7 @@ class TransformersWrapper(CategoricalSequential):
              total_input_ids, attention_mask=total_attention_mask, **kwargs
          )
          log_probs, logits = self._log_probs_from_logits(
-             total_tokens_out, response_input_ids, pad_val=pad_val
+             total_tokens_out, response_input_ids, pad_val=-100
          )
          # for i in range(log_probs.size(0)):
          #     assert log_probs[i].shape[-1] == response_input_ids[i].shape[-1]
@@ -522,7 +561,7 @@ class TransformersWrapper(CategoricalSequential):
          return out
 
      @classmethod
-     def _log_probs_from_logits(cls, total_tokens_out, response_input_ids, pad_val):
+     def _log_probs_from_logits(cls, total_tokens_out, response_input_ids, pad_val=-100):
          response_input_ids = pad_sequence(
              response_input_ids,
              padding_value=pad_val,
@@ -532,10 +571,21 @@ class TransformersWrapper(CategoricalSequential):
          pad_mask = response_input_ids != pad_val
 
          logits = total_tokens_out["logits"]
-         logits = logits.log_softmax(dim=-1)
-         logits = logits[:, -response_input_ids.shape[-1] - 1 : -1, :]
-
-         log_probs = logits.gather(-1, response_input_ids.unsqueeze(-1)).squeeze(-1)
+         # logits = logits.log_softmax(dim=-1)
+         if logits.shape[-2] != response_input_ids.shape[-1]:
+             logits = logits[..., -response_input_ids.shape[-1] - 1 : -1, :]
+
+         td = TensorDict(
+             logits=logits, response_input_ids=response_input_ids
+         ).auto_batch_size_()
+         with td.flatten() as tdflat:
+             tdflat["log_probs"] = -torch.nn.functional.cross_entropy(
+                 tdflat["logits"],
+                 tdflat["response_input_ids"],
+                 reduce=False,
+                 ignore_index=pad_val,
+             )
+         log_probs = td["log_probs"]
 
          # Recover the list
          log_probs = _unpad_tensors(log_probs, pad_mask)
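
The switch from `log_softmax` + `gather` to a negated `cross_entropy` is an exact identity, and `ignore_index=-100` explains the new pad value: padded positions contribute zero instead of gathering at an out-of-range token id. A small numerical check (note that `reduce=False` in the hunk is the deprecated spelling of `reduction="none"`):

    import torch
    import torch.nn.functional as F

    logits = torch.randn(6, 32)  # (tokens, vocab)
    ids = torch.randint(0, 32, (6,))

    via_gather = logits.log_softmax(-1).gather(-1, ids.unsqueeze(-1)).squeeze(-1)
    via_ce = -F.cross_entropy(logits, ids, reduction="none")
    torch.testing.assert_close(via_gather, via_ce)

    ids[0] = -100  # a padded position
    masked = -F.cross_entropy(logits, ids, reduction="none", ignore_index=-100)
    assert masked[0] == 0  # pad contributes nothing
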
@@ -543,7 +593,7 @@ class TransformersWrapper(CategoricalSequential):
          return log_probs, logits
 
      @classmethod
-     def _log_probs_generate(cls, sequences, logits, pad_val):
+     def _log_probs_generate(cls, sequences, logits, pad_val=-100):
          tokens = pad_sequence(
              sequences,
              padding_value=pad_val,
@@ -557,6 +607,12 @@ class TransformersWrapper(CategoricalSequential):
              padding_side="left",
          )
 
-         logits = logits.log_softmax(dim=-1)
-         log_probs = logits.gather(-1, tokens.unsqueeze(-1)).squeeze(-1)
+         # logits = logits.log_softmax(dim=-1)
+         # log_probs = logits.gather(-1, tokens.unsqueeze(-1)).squeeze(-1)
+         td = TensorDict(logits=logits, tokens=tokens).auto_batch_size_()
+         with td.flatten() as tdflat:
+             tdflat["log_probs"] = -torch.nn.functional.cross_entropy(
+                 tdflat["logits"], tdflat["tokens"], reduce=False, ignore_index=pad_val
+             )
+         log_probs = td["log_probs"]
          return log_probs, logits
torchrl/modules/llm/policies/vllm_wrapper.py
@@ -74,6 +74,9 @@ class vLLMWrapper(CategoricalSequential):
          conserve type, batch-size, and device). Defaults to `True` when generating a single sample, `False`
          otherwise.
 
+     chat_template_name (str | None, optional): The name of the chat template to use for the history. Defaults to `None`.
+     chat_template (str | None, optional): The chat template to use for the history. Defaults to `None`.
+
      .. note:: The tokenizer is used when `from_text` is `True` to convert input text into token sequences. It is also
          required (or retrieved) when `pad_output` is `True` or when using text inputs with `generate=False` to ensure proper
          tokenization and padding.
@@ -120,6 +123,7 @@ class vLLMWrapper(CategoricalSequential):
      token_response_key: NestedKey = ("tokens_response",)
      text_response_key: NestedKey = ("text_response",)
      attention_mask_key: NestedKey = ("attention_mask",)
+     history_key: NestedKey = ("history",)
 
      def __init__(
          self,
@@ -137,6 +141,8 @@ class vLLMWrapper(CategoricalSequential):
          tokenizer_kwargs: dict | None = None,
          pad_output: bool = False,
          inplace: Literal[True, False, "empty"] | None = None,
+         chat_template_name: str | None = None,
+         chat_template: str | None = None,
      ):
          super().__init__()
 
@@ -149,6 +155,8 @@ class vLLMWrapper(CategoricalSequential):
          self._device = device
          self.generate = generate
          self.pad_output = pad_output
+         self.chat_template_name = chat_template_name
+         self.chat_template = chat_template
          padding_value = None
 
          if not tokenizer_kwargs:
@@ -329,7 +337,12 @@ class vLLMWrapper(CategoricalSequential):
          history = td.get(self.history_key)
          if history is None:
              raise ValueError("No text or history provided to the vLLMWrapper.")
-         text = history.apply_chat_template(self.tokenizer)
+         tokenizer_kwargs = {}
+         if self.chat_template_name is not None:
+             tokenizer_kwargs["chat_template_name"] = self.chat_template_name
+         if self.chat_template is not None:
+             tokenizer_kwargs["chat_template"] = self.chat_template
+         text = history.apply_chat_template(self.tokenizer, **tokenizer_kwargs)
          if self.pad_output:
              tokenizer_kwargs = self.tokenizer_kwargs
          if not isinstance(text, (list, str)):
@@ -385,15 +398,35 @@ class vLLMWrapper(CategoricalSequential):
 
      def _from_vllm_logprobs_text(self, td, sampling_params, out):
          text_prompt = td.get(self.text_key)
-         if text_prompt is None:
+         text_response = td.get(self.text_response_key)
+         if text_response is None or text_prompt is None:
+             if text_response is not None and text_prompt is not None:
+                 raise ValueError(
+                     "No text or history provided to the vLLMWrapper. Either both are provided or none of them."
+                 )
              # Fallback on history parsing
              history = td.get(self.history_key)
              if history is None:
-                 raise ValueError("No text or history provided to the vLLMWrapper.")
-             text_prompt = history.apply_chat_template(self.tokenizer)
+                 raise ValueError(
+                     "No text or history provided to the TransformersWrapper."
+                 )
+             tokenizer_kwargs = {}
+             if self.chat_template_name is not None:
+                 tokenizer_kwargs.setdefault(
+                     "chat_template_name", self.chat_template_name
+                 )
+             if self.chat_template is not None:
+                 tokenizer_kwargs.setdefault("chat_template", self.chat_template)
+             tokenizer_kwargs.setdefault("add_generation_prompt", False)
+             text_response = history.apply_chat_template(
+                 tokenizer=self.tokenizer, **tokenizer_kwargs
+             )
+             if isinstance(text_response, list):
+                 text_prompt = ["" for _ in text_response]
+             else:
+                 text_prompt = ""
          if not isinstance(text_prompt, list):
              text_prompt = text_prompt.tolist()
-         text_response = td.get(self.text_response_key)
          if not isinstance(text_response, list):
              text_response = text_response.tolist()
          text = [_x + _y for _x, _y in _zip_strict(text_prompt, text_response)]
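
As in `TransformersWrapper`, the history fallback treats the whole templated chat as the response and leaves the prompt empty, so log-probs cover the full sequence. A schematic of the resulting concatenation (hypothetical helper, illustrative strings):

    def assemble_scored_text(text_prompt, text_response):
        # Mirrors the hunk above: empty prompts mean "score the whole chat".
        return [p + r for p, r in zip(text_prompt, text_response)]

    templated = ["<templated chat 1>", "<templated chat 2>"]
    prompts = ["" for _ in templated]  # fallback path: no explicit prompt
    print(assemble_scored_text(prompts, templated))
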
torchrl/objectives/llm/__init__.py
@@ -5,5 +5,6 @@
  from __future__ import annotations
 
  from .grpo import GRPOLoss, GRPOLossOutput, MCAdvantage
+ from .sft import SFTLoss, SFTLossOutput
 
- __all__ = ["GRPOLoss", "GRPOLossOutput", "MCAdvantage"]
+ __all__ = ["GRPOLoss", "GRPOLossOutput", "MCAdvantage", "SFTLoss", "SFTLossOutput"]
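
With the new re-exports, the SFT objective is importable from the subpackage root. A usage sketch reusing the names from the `RetrieveLogProb` docstring example above (`policy_train`, `tokenizer`, and `data` are defined there):

    from torchrl.objectives.llm import SFTLoss, SFTLossOutput

    loss = SFTLoss(
        actor_network=policy_train,  # a TransformersWrapper with generate=False
        tokenizer=tokenizer,
        reduction="mean",
        normalize_by_seq_length=True,
        kl_to_ref_coeff=0.1,
        tokenizer_kwargs={"chat_template_name": "qwen"},
    )
    loss_vals = loss(data)  # data carries ("next", "ref_log_prob") from RetrieveLogProb
    total = loss_vals.sum(reduce=True)
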