torchrl-nightly 2025.6.20 (cp313-cp313-win_amd64.whl) → 2025.6.22 (cp313-cp313-win_amd64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchrl/_torchrl.cp313-win_amd64.pyd +0 -0
- torchrl/collectors/collectors.py +8 -5
- torchrl/collectors/llm/base.py +13 -6
- torchrl/collectors/llm/ray_collector.py +3 -0
- torchrl/data/__init__.py +2 -0
- torchrl/data/llm/__init__.py +2 -0
- torchrl/data/llm/chat.py +59 -8
- torchrl/data/llm/topk.py +186 -0
- torchrl/data/replay_buffers/ray_buffer.py +15 -1
- torchrl/data/replay_buffers/replay_buffers.py +50 -11
- torchrl/data/replay_buffers/samplers.py +98 -21
- torchrl/data/replay_buffers/storages.py +29 -2
- torchrl/envs/llm/__init__.py +2 -0
- torchrl/envs/llm/chat.py +4 -1
- torchrl/envs/llm/reward/gsm8k.py +15 -8
- torchrl/envs/llm/transforms/__init__.py +2 -1
- torchrl/envs/llm/transforms/kl.py +240 -4
- torchrl/modules/llm/policies/transformers_wrapper.py +71 -15
- torchrl/modules/llm/policies/vllm_wrapper.py +38 -5
- torchrl/objectives/llm/__init__.py +2 -1
- torchrl/objectives/llm/sft.py +465 -0
- torchrl/version.py +2 -2
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/METADATA +1 -1
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/RECORD +27 -25
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/LICENSE +0 -0
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/WHEEL +0 -0
- {torchrl_nightly-2025.6.20.dist-info → torchrl_nightly-2025.6.22.dist-info}/top_level.txt +0 -0
--- a/torchrl/modules/llm/policies/transformers_wrapper.py
+++ b/torchrl/modules/llm/policies/transformers_wrapper.py
@@ -65,6 +65,10 @@ class TransformersWrapper(CategoricalSequential):
             operations. If `True`, operations will be performed in-place. If `False`, a new TensorDict instance will be
             created. If `"empty"`, the output data structure will be initialized with `input.empty()` (i.e., it will
             conserve type, batch-size, and device). Defaults to `True`.
+        chat_template_name (Literal["chatml_format", "qwen"] | None, optional): The name of the chat template to use when
+            applying the chat template to the history. Defaults to `None`.
+        chat_template (str | None, optional): The chat template to use when applying the chat template to the history.
+            Defaults to `None`.

     .. note:: The tokenizer is used when `from_text` is `True` to convert input text into token sequences. It is also
         required (or retrieved) when `pad_output` is `True` or when using text inputs with `generate=False` to ensure proper
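The two new arguments let callers pick how a stored conversation history is rendered before tokenization. A minimal construction sketch follows; the model/tokenizer setup and the `from_text`/`generate` flags are illustrative assumptions based on the surrounding docstring, not part of this diff:

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from torchrl.modules.llm import TransformersWrapper

    # Hypothetical small instruct model; any HF causal LM with a chat template would do.
    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    wrapper = TransformersWrapper(
        model,
        tokenizer=tokenizer,
        from_text=True,
        generate=False,
        chat_template_name="qwen",  # new: named template applied to the history
        # chat_template="...",      # new: or pass a raw template string instead
    )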
@@ -131,6 +135,8 @@ class TransformersWrapper(CategoricalSequential):
         tokenizer_kwargs: dict | None = None,
         pad_output: bool = True,
         inplace: Literal[True, False, "empty"] | None = True,
+        chat_template_name: Literal["chatml_format", "qwen"] | None = None,
+        chat_template: str | None = None,
     ):
         super().__init__()

@@ -143,6 +149,8 @@ class TransformersWrapper(CategoricalSequential):
         self.inplace = inplace
         self.pad_output = pad_output
         padding_value = None
+        self.chat_template_name = chat_template_name
+        self.chat_template = chat_template

         if not tokenizer_kwargs:
             tokenizer_kwargs = {}
@@ -300,7 +308,17 @@ class TransformersWrapper(CategoricalSequential):
                 raise ValueError(
                     "No text or history provided to the TransformersWrapper."
                 )
-
+            tokenizer_kwargs = {}
+            if self.chat_template_name is not None:
+                tokenizer_kwargs.setdefault(
+                    "chat_template_name", self.chat_template_name
+                )
+            if self.chat_template is not None:
+                tokenizer_kwargs.setdefault("chat_template", self.chat_template)
+            tokenizer_kwargs.setdefault("add_generation_prompt", False)
+            text = history.apply_chat_template(
+                tokenizer=self.tokenizer, **tokenizer_kwargs
+            )
         if not isinstance(text, (list, str)):
             text = text.tolist()
         tokens_in = self.tokenizer(text, **self.tokenizer_kwargs)
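In the history fallback above, the wrapper now forwards the configured template options to `History.apply_chat_template` and forces `add_generation_prompt=False` before tokenizing. A rough sketch of the equivalent direct call; the `History` construction below (field names `role`/`content`, the batch size) is an assumption about `torchrl.data.llm.chat.History`, and only the `apply_chat_template` keyword arguments are taken from this diff:

    from transformers import AutoTokenizer
    from torchrl.data.llm.chat import History

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    # Assumed tensorclass construction: one conversation with two turns.
    history = History(
        role=["user", "assistant"],
        content=["What is 2 + 2?", "4"],
        batch_size=(2,),
    )

    # Mirrors the wrapper's fallback: named template (if any) and no generation prompt.
    text = history.apply_chat_template(
        tokenizer=tokenizer,
        chat_template_name="qwen",
        add_generation_prompt=False,
    )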
@@ -325,7 +343,7 @@ class TransformersWrapper(CategoricalSequential):
             logits = torch.stack(list(tokens_out["logits"]), 1)
             logits = _unpad_tensors(logits, mask_sequences, as_nested=False)
             log_probs, logits = self._log_probs_generate(
-                sequences, logits, pad_val
+                sequences, logits, pad_val=-100
             )
             response_text = self.tokenizer.batch_decode(
                 sequences, skip_special_tokens=False
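The hard-coded `pad_val=-100` matches the default `ignore_index` of `torch.nn.functional.cross_entropy`, which the new log-probability helpers below rely on: padded target positions contribute exactly zero instead of a spurious log-probability. A small standalone check, using `reduction="none"` as the non-deprecated spelling of the `reduce=False` flag that appears in the diff:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(4, 10)               # 4 positions, vocabulary of 10
    targets = torch.tensor([3, 7, -100, 1])   # -100 marks a padded position

    loss = F.cross_entropy(logits, targets, reduction="none", ignore_index=-100)
    print(loss)  # the third element is exactly 0.0: ignored rather than scored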
@@ -407,17 +425,36 @@ class TransformersWrapper(CategoricalSequential):
         pad_val = self.tokenizer.pad_token_id

         prompt_txt = td.get(self.text_key)
-
+        response_txt = td.get(self.text_response_key)
+        if prompt_txt is None or response_txt is None:
+            if prompt_txt is not None and response_txt is not None:
+                raise ValueError(
+                    "No text or history provided to the TransformersWrapper. Either both are provided or none of them."
+                )
             # Fallback on history parsing
             history = td.get(self.history_key)
             if history is None:
                 raise ValueError(
                     "No text or history provided to the TransformersWrapper."
                 )
-
+            tokenizer_kwargs = {}
+            if self.chat_template_name is not None:
+                tokenizer_kwargs.setdefault(
+                    "chat_template_name", self.chat_template_name
+                )
+            if self.chat_template is not None:
+                tokenizer_kwargs.setdefault("chat_template", self.chat_template)
+            tokenizer_kwargs.setdefault("add_generation_prompt", False)
+            response_txt = history.apply_chat_template(
+                tokenizer=self.tokenizer, **tokenizer_kwargs
+            )
+            if isinstance(response_txt, list):
+                prompt_txt = ["" for _ in response_txt]
+            else:
+                prompt_txt = ""
+
         if not isinstance(prompt_txt, (list, str)):
             prompt_txt = prompt_txt.tolist()
-        response_txt = td.get(self.text_response_key)
         if not isinstance(response_txt, (list, str)):
             response_txt = response_txt.tolist()
         total_txt = [x + y for x, y in _zip_strict(prompt_txt, response_txt)]
@@ -484,7 +523,7 @@ class TransformersWrapper(CategoricalSequential):

         total_input_ids = [
             torch.cat([_prompt_input_ids, _response_input_ids], -1)
-            for _prompt_input_ids, _response_input_ids in
+            for _prompt_input_ids, _response_input_ids in _zip_strict(
                 prompt_input_ids, response_input_ids
             )
         ]
@@ -512,7 +551,7 @@ class TransformersWrapper(CategoricalSequential):
             total_input_ids, attention_mask=total_attention_mask, **kwargs
         )
         log_probs, logits = self._log_probs_from_logits(
-            total_tokens_out, response_input_ids, pad_val
+            total_tokens_out, response_input_ids, pad_val=-100
         )
         # for i in range(log_probs.size(0)):
         #     assert log_probs[i].shape[-1] == response_input_ids[i].shape[-1]
@@ -522,7 +561,7 @@ class TransformersWrapper(CategoricalSequential):
         return out

     @classmethod
-    def _log_probs_from_logits(cls, total_tokens_out, response_input_ids, pad_val):
+    def _log_probs_from_logits(cls, total_tokens_out, response_input_ids, pad_val=-100):
         response_input_ids = pad_sequence(
             response_input_ids,
             padding_value=pad_val,
@@ -532,10 +571,21 @@ class TransformersWrapper(CategoricalSequential):
         pad_mask = response_input_ids != pad_val

         logits = total_tokens_out["logits"]
-        logits = logits.log_softmax(dim=-1)
-
-
-
+        # logits = logits.log_softmax(dim=-1)
+        if logits.shape[-2] != response_input_ids.shape[-1]:
+            logits = logits[..., -response_input_ids.shape[-1] - 1 : -1, :]
+
+        td = TensorDict(
+            logits=logits, response_input_ids=response_input_ids
+        ).auto_batch_size_()
+        with td.flatten() as tdflat:
+            tdflat["log_probs"] = -torch.nn.functional.cross_entropy(
+                tdflat["logits"],
+                tdflat["response_input_ids"],
+                reduce=False,
+                ignore_index=pad_val,
+            )
+        log_probs = td["log_probs"]

         # Recover the list
         log_probs = _unpad_tensors(log_probs, pad_mask)
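The rewritten `_log_probs_from_logits` trades the explicit `log_softmax` + `gather` for a negated, unreduced cross-entropy, which produces the same per-token log-probabilities for non-ignored positions while letting `ignore_index=pad_val` zero out padding before `_unpad_tensors` strips it. A self-contained equivalence check (shapes and seed are arbitrary):

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    logits = torch.randn(2, 5, 32)          # [batch, time, vocab]
    tokens = torch.randint(0, 32, (2, 5))   # target token ids

    # Previous approach: full log-softmax, then gather the target column.
    ref = logits.log_softmax(-1).gather(-1, tokens.unsqueeze(-1)).squeeze(-1)

    # New approach: negated token-level cross-entropy over flattened positions.
    new = -F.cross_entropy(
        logits.flatten(0, 1), tokens.flatten(), reduction="none"
    ).view_as(tokens)

    assert torch.allclose(ref, new, atol=1e-6)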
@@ -543,7 +593,7 @@ class TransformersWrapper(CategoricalSequential):
         return log_probs, logits

     @classmethod
-    def _log_probs_generate(cls, sequences, logits, pad_val):
+    def _log_probs_generate(cls, sequences, logits, pad_val=-100):
         tokens = pad_sequence(
             sequences,
             padding_value=pad_val,
@@ -557,6 +607,12 @@ class TransformersWrapper(CategoricalSequential):
             padding_side="left",
         )

-        logits = logits.log_softmax(dim=-1)
-        log_probs = logits.gather(-1, tokens.unsqueeze(-1)).squeeze(-1)
+        # logits = logits.log_softmax(dim=-1)
+        # log_probs = logits.gather(-1, tokens.unsqueeze(-1)).squeeze(-1)
+        td = TensorDict(logits=logits, tokens=tokens).auto_batch_size_()
+        with td.flatten() as tdflat:
+            tdflat["log_probs"] = -torch.nn.functional.cross_entropy(
+                tdflat["logits"], tdflat["tokens"], reduce=False, ignore_index=pad_val
+            )
+        log_probs = td["log_probs"]
         return log_probs, logits
--- a/torchrl/modules/llm/policies/vllm_wrapper.py
+++ b/torchrl/modules/llm/policies/vllm_wrapper.py
@@ -74,6 +74,9 @@ class vLLMWrapper(CategoricalSequential):
             conserve type, batch-size, and device). Defaults to `True` when generating a single sample, `False`
             otherwise.

+        chat_template_name (str | None, optional): The name of the chat template to use for the history. Defaults to `None`.
+        chat_template (str | None, optional): The chat template to use for the history. Defaults to `None`.
+
     .. note:: The tokenizer is used when `from_text` is `True` to convert input text into token sequences. It is also
         required (or retrieved) when `pad_output` is `True` or when using text inputs with `generate=False` to ensure proper
         tokenization and padding.
@@ -120,6 +123,7 @@ class vLLMWrapper(CategoricalSequential):
     token_response_key: NestedKey = ("tokens_response",)
     text_response_key: NestedKey = ("text_response",)
     attention_mask_key: NestedKey = ("attention_mask",)
+    history_key: NestedKey = ("history",)

     def __init__(
         self,
@@ -137,6 +141,8 @@ class vLLMWrapper(CategoricalSequential):
         tokenizer_kwargs: dict | None = None,
         pad_output: bool = False,
         inplace: Literal[True, False, "empty"] | None = None,
+        chat_template_name: str | None = None,
+        chat_template: str | None = None,
     ):
         super().__init__()

@@ -149,6 +155,8 @@ class vLLMWrapper(CategoricalSequential):
         self._device = device
         self.generate = generate
         self.pad_output = pad_output
+        self.chat_template_name = chat_template_name
+        self.chat_template = chat_template
         padding_value = None

         if not tokenizer_kwargs:
@@ -329,7 +337,12 @@ class vLLMWrapper(CategoricalSequential):
             history = td.get(self.history_key)
             if history is None:
                 raise ValueError("No text or history provided to the vLLMWrapper.")
-
+            tokenizer_kwargs = {}
+            if self.chat_template_name is not None:
+                tokenizer_kwargs["chat_template_name"] = self.chat_template_name
+            if self.chat_template is not None:
+                tokenizer_kwargs["chat_template"] = self.chat_template
+            text = history.apply_chat_template(self.tokenizer, **tokenizer_kwargs)
         if self.pad_output:
             tokenizer_kwargs = self.tokenizer_kwargs
             if not isinstance(text, (list, str)):
@@ -385,15 +398,35 @@ class vLLMWrapper(CategoricalSequential):

     def _from_vllm_logprobs_text(self, td, sampling_params, out):
         text_prompt = td.get(self.text_key)
-
+        text_response = td.get(self.text_response_key)
+        if text_response is None or text_prompt is None:
+            if text_response is not None and text_prompt is not None:
+                raise ValueError(
+                    "No text or history provided to the vLLMWrapper. Either both are provided or none of them."
+                )
             # Fallback on history parsing
             history = td.get(self.history_key)
             if history is None:
-                raise ValueError(
-
+                raise ValueError(
+                    "No text or history provided to the TransformersWrapper."
+                )
+            tokenizer_kwargs = {}
+            if self.chat_template_name is not None:
+                tokenizer_kwargs.setdefault(
+                    "chat_template_name", self.chat_template_name
+                )
+            if self.chat_template is not None:
+                tokenizer_kwargs.setdefault("chat_template", self.chat_template)
+            tokenizer_kwargs.setdefault("add_generation_prompt", False)
+            text_response = history.apply_chat_template(
+                tokenizer=self.tokenizer, **tokenizer_kwargs
+            )
+            if isinstance(text_response, list):
+                text_prompt = ["" for _ in text_response]
+            else:
+                text_prompt = ""
         if not isinstance(text_prompt, list):
             text_prompt = text_prompt.tolist()
-        text_response = td.get(self.text_response_key)
         if not isinstance(text_response, list):
             text_response = text_response.tolist()
         text = [_x + _y for _x, _y in _zip_strict(text_prompt, text_response)]
--- a/torchrl/objectives/llm/__init__.py
+++ b/torchrl/objectives/llm/__init__.py
@@ -5,5 +5,6 @@
 from __future__ import annotations

 from .grpo import GRPOLoss, GRPOLossOutput, MCAdvantage
+from .sft import SFTLoss, SFTLossOutput

-__all__ = ["GRPOLoss", "GRPOLossOutput", "MCAdvantage"]
+__all__ = ["GRPOLoss", "GRPOLossOutput", "MCAdvantage", "SFTLoss", "SFTLossOutput"]