torchrl-nightly 2025.6.20__cp310-cp310-macosx_10_9_universal2.whl → 2025.6.21__cp310-cp310-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -352,8 +352,8 @@ class DataCollectorBase(IterableDataset, metaclass=abc.ABCMeta):
  self._iterator = iter(self)
  out = next(self._iterator)
  # if any, we don't want the device ref to be passed in distributed settings
- if out is not None:
- out.clear_device_()
+ if out is not None and (out.device != "cpu"):
+ out = out.copy().clear_device_()
  return out
  except StopIteration:
  return None
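
Note on the hunk above: `clear_device_()` operates in place, so clearing the device on the yielded tensordict would also clear it on any internal buffer sharing that tensordict; the new code copies the structure first and only does so when the output is not already on CPU. A minimal sketch of the pattern with the tensordict API alone (collector internals elided):

    import torch
    from tensordict import TensorDict

    device = "cuda" if torch.cuda.is_available() else "cpu"
    buf = TensorDict({"obs": torch.zeros(4, 3)}, batch_size=[4], device=device)

    out = buf
    if out is not None and out.device is not None and out.device != torch.device("cpu"):
        # copy() duplicates the tensordict structure without cloning the tensors,
        # so clearing the device on the copy leaves buf.device untouched
        out = out.copy().clear_device_()
    print(buf.device, out.device)
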
@@ -892,7 +892,10 @@ class SyncDataCollector(DataCollectorBase):
  and hasattr(self.postproc, "to")
  and self.storing_device
  ):
- self.postproc.to(self.storing_device)
+ postproc = self.postproc.to(self.storing_device)
+ if postproc is not self.postproc and postproc is not None:
+ self.postproc = postproc
+
  if frames_per_batch % self.n_env != 0 and RL_WARNINGS:
  warnings.warn(
  f"frames_per_batch ({frames_per_batch}) is not exactly divisible by the number of batched environments ({self.n_env}), "
@@ -1253,9 +1256,9 @@ class SyncDataCollector(DataCollectorBase):
  yield
  continue
  self._increment_frames(tensordict_out.numel())
- if self.verbose:
- torchrl_logger.info("Collector: postproc.")
  tensordict_out = self._postproc(tensordict_out)
+ if self.verbose:
+ torchrl_logger.info("Collector: postproc done.")
  if self.return_same_td:
  # This is used with multiprocessed collectors to use the buffers
  # stored in the tensordict.
@@ -242,6 +242,11 @@ class LLMCollector(SyncDataCollector):
  else:
  self.policy_version_tracker = None

+ def set_postproc(self, postproc: Callable[[TensorDictBase], TensorDictBase]):
+ if self.postproc is not None:
+ raise RuntimeError("Postproc already set")
+ self.postproc = postproc
+
  def increment_version(self):
  """Increment the policy version."""
  if self.policy_version_tracker is not None:
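
A hedged usage sketch for the new `set_postproc` hook (collector construction is elided because its arguments depend on the environment and policy at hand; the calls below are illustrative only):

    import torch
    from tensordict import TensorDictBase

    def tag_batch(td: TensorDictBase) -> TensorDictBase:
        # illustrative postproc: stamp each collected batch with a boolean flag
        return td.set("postprocessed", torch.ones(td.shape, dtype=torch.bool))

    # collector = LLMCollector(...)               # hypothetical construction
    # collector.set_postproc(tag_batch)           # raises RuntimeError if a postproc already exists
    # ray_collector.set_postproc(tag_batch)       # RayLLMCollector forwards the call to the remote collector
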
@@ -361,9 +366,10 @@ class LLMCollector(SyncDataCollector):
  )
  self._yield_queues[idx].clear()
  result = self._trajectory_queue.popleft()
- torchrl_logger.info(
- f"LLMCollector: Yielding completed trajectory with shape {result.shape}."
- )
+ if self.verbose:
+ torchrl_logger.info(
+ f"LLMCollector: Yielding completed trajectory with shape {result.shape}."
+ )
  return result

  started = False
@@ -422,9 +428,10 @@ class LLMCollector(SyncDataCollector):
  self.env.async_step_and_maybe_reset_send(env_input)

  result = self._trajectory_queue.popleft()
- torchrl_logger.info(
- f"LLMCollector: Yielding completed trajectory with shape {result.shape}."
- )
+ if self.verbose:
+ torchrl_logger.info(
+ f"LLMCollector: Yielding completed trajectory with shape {result.shape}."
+ )
  return result

  as_remote = as_remote
@@ -134,6 +134,9 @@ class RayLLMCollector(LLMCollector):
  verbose=verbose,
  )

+ def set_postproc(self, postproc: Callable[[TensorDictBase], TensorDictBase]):
+ return ray.get(self._collector.set_postproc.remote(postproc))
+
  def _next_remote(self) -> None:
  return self._collector.next.remote()

torchrl/data/__init__.py CHANGED
@@ -17,6 +17,7 @@ from .llm import (
  RolloutFromModel,
  TensorDictTokenizer,
  TokenizedDatasetLoader,
+ TopKRewardSelector,
  )
  from .map import (
  BinaryToDecimal,
@@ -116,6 +117,7 @@ __all__ = [
  "Categorical",
  "Choice",
  "ContentBase",
+ "TopKRewardSelector",
  "Composite",
  "CompositeSpec",
  "ConstantKLController",
torchrl/data/llm/__init__.py CHANGED
@@ -13,6 +13,7 @@ from .dataset import (
  )
  from .prompt import PromptData, PromptTensorDictTokenizer
  from .reward import PairwiseDataset, RewardData
+ from .topk import TopKRewardSelector
  from .utils import AdaptiveKLController, ConstantKLController, RolloutFromModel

  __all__ = [
@@ -30,4 +31,5 @@ __all__ = [
  "TokenizedDatasetLoader",
  "create_infinite_iterator",
  "get_dataloader",
+ "TopKRewardSelector",
  ]
torchrl/data/llm/chat.py CHANGED
@@ -11,18 +11,27 @@ from typing import Literal

  import torch

- from tensordict import lazy_stack, LazyStackedTensorDict, list_to_stack, TensorClass
+ from tensordict import (
+ lazy_stack,
+ LazyStackedTensorDict,
+ list_to_stack,
+ TensorClass,
+ TensorDict,
+ )
  from tensordict.utils import _maybe_correct_neg_dim
-
  from torchrl._utils import logger as torchrl_logger


  _CHAT_TEMPLATES = {
  "chatml_format": """{% for message in messages %}
+ {%- if message['role'] == 'assistant' %}
+ {% generation %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endgeneration %}
+ {%- else %}
  {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
+ {%- endif %}
  {% endfor %}
  {%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
+ {% generation %}{{- '<|im_start|>assistant\n' }}{% endgeneration %}
  {%- endif %}
  """,
  "qwen": """
@@ -282,7 +291,7 @@ class History(TensorClass["nocast"]):

  Keyword Args:
  tokenizer (transformers.PreTrainedTokenizer | transformers.AutoProcessor): The tokenizer to use.
- add_generation_prompt (bool, optional): Whether to add a generation prompt. Defaults to `True`.
+ add_generation_prompt (bool, optional): Whether to add a generation prompt (e.g. `"<|im_start|>assistant"`). Defaults to `True`.
  chat_template (str, optional): The chat template to use. Defaults to the tokenizer's default template.
  chat_template_name (Literal["chatml_format", "qwen"], optional): The name of the chat template to use.
  Prevalent over `tokenizer.chat_template`. Defaults to `None`.
@@ -293,6 +302,7 @@ class History(TensorClass["nocast"]):
  return_tensors (str | None, optional): The type of tensors to return. Defaults to "pt".
  return_dict (bool, optional): Whether to return a dictionary. Defaults to `False`.
  return_assistant_tokens_mask (bool, optional): Whether to return a mask of the assistant generated tokens.
+ If `True`, the mask will be written to the `assistant_masks` key.
  For tokens generated by the assistant, the mask will contain `1`.
  For user and system tokens, the mask will contain `0`.
  This functionality is only available for chat templates that support it via the `{% generation %}` keyword.
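
With the `{% generation %}` blocks added to the chatml template above, `return_assistant_tokens_mask=True` can flag which tokens were produced by the assistant. A hedged, commented-out sketch (it needs a downloaded tokenizer; the model name is illustrative and the keys follow the docstring above):

    # from transformers import AutoTokenizer
    # from torchrl.data.llm.chat import History
    #
    # tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative checkpoint
    # history = History.from_chats([[
    #     {"role": "user", "content": "Hi!"},
    #     {"role": "assistant", "content": "Hello!"},
    # ]])
    # out = history.apply_chat_template(
    #     tokenizer=tokenizer,
    #     chat_template_name="chatml_format",
    #     add_generation_prompt=False,
    #     return_dict=True,
    #     return_assistant_tokens_mask=True,
    # )
    # # per the docstring: assistant tokens are flagged with 1, user/system tokens with 0
    # assert out["assistant_masks"].any()
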
@@ -315,6 +325,11 @@ class History(TensorClass["nocast"]):
  raise RuntimeError(
  "You must specify a tokenizer to use when chat_template is not specified."
  )
+ elif "qwen" in getattr(tokenizer, "name_or_path", "").lower():
+ # We prefer our implementation of the Qwen template,
+ # since it accounts for the assistant's masking.
+ chat_template = _CHAT_TEMPLATES["qwen"]
+ chat_template_name = None
  else:
  chat_template = tokenizer.chat_template
  if chat_template is None:
@@ -333,7 +348,7 @@ class History(TensorClass["nocast"]):
  return_dict = False

  if self.ndim > 1:
- return [
+ result = [
  self[i].apply_chat_template(
  tokenizer=tokenizer,
  add_generation_prompt=add_generation_prompt,
@@ -350,12 +365,16 @@ class History(TensorClass["nocast"]):
  )
  for i in range(self.batch_size[0])
  ]
+ if return_dict:
+ return lazy_stack(result)
+ else:
+ return result
  self_flat = self.view(-1)
  # tolist_first=True is needed to avoid having a list of dict of dicts, but a list of dicts of lists of dicts
  self_flat = self_flat.tolist(tolist_first=True)
  # Remove the "<none>" role
  self_flat = [item for item in self_flat if item["role"] != "<none>"]
- return tokenizer.apply_chat_template(
+ result = tokenizer.apply_chat_template(
  conversation=self_flat,
  add_generation_prompt=add_generation_prompt,
  chat_template=chat_template,
@@ -368,6 +387,16 @@ class History(TensorClass["nocast"]):
  return_assistant_tokens_mask=return_assistant_tokens_mask,
  **kwargs,
  )
+ if not isinstance(result, (torch.Tensor, list, str)):
+ result = TensorDict.from_dict(result, auto_batch_size=True, batch_dims=1)
+ # If self has a batch_dims of 1, we have just the time dimension, so we need to remove the batch dim from the result
+ if self.batch_dims == 1:
+ if result.batch_size[0] != 1:
+ raise RuntimeError(
+ f"Expected a batch size of 1, got {result.batch_size[0]}."
+ )
+ result = result.squeeze(0)
+ return result

  @classmethod
  def from_text(
@@ -375,10 +404,20 @@ class History(TensorClass["nocast"]):
  text: str | list[str],
  chat_template_name: Literal["chatml_format", "qwen"] | None = None,
  chat_template: str | None = None,
+ tokenizer: transformers.AutoTokenizer # noqa: F821
+ | transformers.AutoProcessor # noqa: F821
+ | None = None,
  ) -> History:
- if chat_template_name in ("chatml_format", None):
+ if chat_template_name is None and chat_template is None:
+ if "qwen" in getattr(tokenizer, "name_or_path", "").lower():
+ # We can automatically detect the template name from the tokenizer
+ # and use the precoded parser.
+ chat_template_name = "qwen"
+ else:
+ chat_template_name = "chatml_format"
+ elif chat_template_name in ("chatml_format",):
  func = cls._inv_chatml
- elif chat_template_name == "qwen":
+ elif chat_template_name in ("qwen",):
  func = cls._inv_qwen
  else:
  raise NotImplementedError(
@@ -735,3 +774,15 @@ class History(TensorClass["nocast"]):
  }

  return Composite(defaults, shape=shape[:-1], data_cls=cls)
+
+ @classmethod
+ def from_chats(cls, chats: list[list[dict]]) -> History:
+ """Create a History object from a list of chats.
+
+ Args:
+ chats (list[list[dict]]): A list of chats, where each chat is a list of dictionaries.
+ """
+ if isinstance(chats[0], dict):
+ return lazy_stack([cls(**chat) for chat in chats])
+ else:
+ return lazy_stack([cls.from_chats(chat) for chat in chats])
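
A small sketch of the new `from_chats` classmethod (import path taken from the file headers above):

    from torchrl.data.llm.chat import History

    chats = [
        [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi!"}],
        [{"role": "user", "content": "Bye"}, {"role": "assistant", "content": "See you!"}],
    ]
    # nested lists are stacked recursively: a batch of 2 conversations, 2 messages each
    history = History.from_chats(chats)
    print(history.shape)
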
torchrl/data/llm/topk.py ADDED
@@ -0,0 +1,186 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ from __future__ import annotations
+
+ from collections import defaultdict, deque
+ from typing import Any
+
+ import torch
+ from tensordict import NestedKey, TensorDictBase
+ from torchrl._utils import logger as torchrl_logger
+ from torchrl.envs.transforms import Transform
+
+
+ class TopKRewardSelector(Transform):
+ """A replay-buffer transform that selects the top-k rewards for each prompt.
+
+ Args:
+ total_dialog_turns (int): Number of dialog turns to keep in memory for the top-k selection.
+ topk_size (int): Number of top-k rewards to select. Must be smaller than or equal to total_dialog_turns.
+ prompt_key (NestedKey): Key to the prompt in the tensordict. Defaults to "text".
+ rewards_key (NestedKey): Key to the rewards in the tensordict. Defaults to ("next", "reward").
+ done_key (NestedKey): Key to the done state in the tensordict. Defaults to ("next", "done").
+ verbose (bool): Whether to print verbose information. Defaults to `False`.
+
+ Example:
+ >>> from torchrl.data import ReplayBuffer, LazyStackStorage, SamplerWithoutReplacement
+ >>> from tensordict import TensorDict, lazy_stack
+ >>> import torch
+ >>> from torchrl.data.llm.topk import TopKRewardSelector
+ >>> # Create a replay buffer with 50 items, a sampler that samples without replacement, and a batch size of 5
+ >>> rb = ReplayBuffer(
+ ... storage=LazyStackStorage(50),
+ ... sampler=SamplerWithoutReplacement,
+ ... batch_size=5,
+ ... )
+ >>> # Create a tensordict with 50 items, each with 10 dialog turns
+ >>> td = lazy_stack(
+ ... [
+ ... TensorDict(
+ ... {
+ ... ("next", "done"): torch.full((1, 1), True),
+ ... # Reward for i+5 tokens
+ ... ("next", "reward"): torch.full((i + 5, 1), i),
+ ... # total of 10 dialogs per prompt
+ ... "text": f"Prompt {i // 5}",
+ ... }
+ ... )
+ ... for i in range(50)
+ ... ]
+ ... )
+ >>> # Create a top-k reward selector with 5 dialog turns and a top-k size of 3
+ >>> topk = TopKRewardSelector(total_dialog_turns=5, topk_size=3)
+ >>> rb.append_transform(topk)
+ >>> for _td in td.chunk(25):
+ ... rb.extend(_td)
+ >>> # Only wrote top3 of 50 items in 10 groups of 5
+ >>> assert rb.write_count == 30
+ >>> assert len(rb) == 30
+ >>> r3 = rb[:3].get(("next", "reward"), as_padded_tensor=True).squeeze()
+ >>> # 0 and 1 are missing because they're not part of the top-k
+ >>> assert (
+ ... r3 == torch.tensor(
+ ... [
+ ... [4, 4, 4, 4, 4, 4, 4, 4, 4],
+ ... [3, 3, 3, 3, 3, 3, 3, 3, 0],
+ ... [2, 2, 2, 2, 2, 2, 2, 0, 0],
+ ... ]
+ ... )
+ ... ).all()
+ """
+
+ def __init__(
+ self,
+ total_dialog_turns: int,
+ topk_size: int,
+ prompt_key: NestedKey = "text",
+ rewards_key: NestedKey = ("next", "reward"),
+ done_key: NestedKey = ("next", "done"),
+ verbose: bool = True,
+ ):
+ super().__init__()
+ self.in_keys = [prompt_key, rewards_key, done_key]
+ self.prompt_key = prompt_key
+ self.rewards_key = rewards_key
+ self.done_key = done_key
+ self.queues = defaultdict(lambda: deque(maxlen=total_dialog_turns))
+ self.total_dialog_turns = total_dialog_turns
+ self.topk_size = topk_size
+ if topk_size > total_dialog_turns:
+ raise ValueError(
+ f"topk_size must be smaller than or equal to total_dialog_turns, got {topk_size=} and {total_dialog_turns=}"
+ )
+ self.verbose = verbose
+
+ def forward(self, tensordict: TensorDictBase) -> Any:
+ return tensordict
+
+ def _inv_call(self, tensordict: TensorDictBase) -> TensorDictBase:
+ # Tensordict can be any number of dims, but it must contain entire trajectories
+ if tensordict.ndim == 1:
+ # Check how many done states we have
+ num_done = tensordict[self.done_key].sum()
+ if num_done > 1:
+ done_idx = tensordict[self.done_key].nonzero(as_tuple=True)[0] + 1
+ splits = torch.cat([done_idx.new_zeros((1,)), done_idx], dim=0).diff()
+ tensordicts = tensordict.split(splits)
+ tensordicts = [self._inv_call(td) for td in tensordicts]
+ tensordicts = [td for td in tensordicts if td is not None]
+ return torch.cat(tensordicts) if tensordicts else None
+ # Then we have a single trajectory
+ if not tensordict[-1][self.done_key].all():
+ raise RuntimeError("Expected the trajectory to be done.")
+ prompt = tensordict[0][self.prompt_key]
+ if not isinstance(prompt, str):
+ raise TypeError(f"Expected a string as prompt, got {type(prompt)=}")
+ self.queues[prompt].append(tensordict)
+ if len(self.queues[prompt]) == self.total_dialog_turns:
+ if self.verbose:
+ torchrl_logger.info(f"Getting top-k rewards for {prompt=}")
+ # Cat is the most robust way to combine the trajs
+ tds = torch.cat(list(self.queues[prompt]), -1)
+ # Collect rewards
+ reward = tds.get(self.rewards_key, as_nested_tensor=True)
+ reward = self._aggregate_rewards(reward)
+ # Check if all rewards are equal
+ if (reward == reward[0]).all():
+ # If all rewards are equal, we can't select top-k
+ if self.verbose:
+ torchrl_logger.warning(
+ f"All rewards are equal ({reward.unique()=})"
+ )
+ return
+ # Filter out rewards below median
+ median_reward = reward.median(dim=-1, keepdim=True)[0]
+ mask = reward > median_reward
+ filtered_reward = reward[mask]
+ filtered_indices = mask.nonzero(as_tuple=True)[0]
+ # Get top-k from filtered rewards
+ topk_reward = filtered_reward.topk(
+ k=min(self.topk_size, len(filtered_indices)), dim=-1
+ )
+ if not topk_reward.indices.numel():
+ if self.verbose:
+ torchrl_logger.warning(
+ f"No top-{self.topk_size} rewards found ({reward=})"
+ )
+ return
+ # Map back to original indices
+ selected_indices = filtered_indices[topk_reward.indices]
+ tds = tds[selected_indices]
+ if self.verbose:
+ torchrl_logger.info(
+ f"Selected top-{self.topk_size} rewards, with reward {topk_reward.values=}"
+ )
+ return tds
+ return
+ elif tensordict.ndim > 2:
+ # keep the time dim at the end
+ tensordict = tensordict.flatten(0, -2)
+ trajs = tensordict.unbind(-1)
+ # Iterate over the trajectories
+ result = []
+ for traj in trajs:
+ td_out = self._inv_call(traj)
+ if td_out is None:
+ continue
+ result.append(td_out)
+ if result:
+ return torch.cat(result, -1)
+ return
+
+ def _aggregate_rewards(self, reward: torch.Tensor) -> torch.Tensor:
+ """Aggregate the rewards across the dialog turns.
+
+ `reward` is expected to be a nested tensor.
+
+ The default implementation is to take the mean of the rewards across the dialog turns.
+ """
+ # reward = reward.to_padded_tensor(padding=0.0)
+ if reward.ndim < 2 or reward.ndim > 3:
+ raise ValueError(
+ f"Expected reward to be a 2D or 3D tensor, got {reward.ndim}D tensor"
+ )
+ return reward.mean(dim=-2).squeeze(-1)
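
Because the aggregation is isolated in `_aggregate_rewards`, a variant of the transform only needs to override that single method. A hedged sketch, assuming the same nested-tensor reward layout ([dialogs, tokens, 1]) that the default implementation documents:

    import torch
    from torchrl.data.llm.topk import TopKRewardSelector

    class TopKSummedRewardSelector(TopKRewardSelector):
        """Rank dialogs by their total reward instead of the per-token mean."""

        def _aggregate_rewards(self, reward: torch.Tensor) -> torch.Tensor:
            # mirrors the default implementation, with sum in place of mean
            return reward.sum(dim=-2).squeeze(-1)
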
@@ -54,9 +54,12 @@ class RayReplayBuffer(ReplayBuffer):
  """A Ray implementation of the Replay Buffer that can be extended and sampled remotely.

  Keyword Args:
+ replay_buffer_cls (type[ReplayBuffer], optional): the class to use for the replay buffer.
+ Defaults to :class:`~torchrl.data.ReplayBuffer`.
  ray_init_config (dict[str, Any], optiona): keyword arguments to pass to `ray.init()`.
  remote_config (dict[str, Any], optiona): keyword arguments to pass to `cls.as_remote()`.
  Defaults to `torchrl.collectors.distributed.ray.DEFAULT_REMOTE_CLASS_CONFIG`.
+ **kwargs: keyword arguments to pass to the replay buffer class.

  .. seealso:: :class:`~torchrl.data.ReplayBuffer` for a list of other keyword arguments.

@@ -119,6 +122,7 @@ class RayReplayBuffer(ReplayBuffer):
  def __init__(
  self,
  *args,
+ replay_buffer_cls: type[ReplayBuffer] | None = ReplayBuffer,
  ray_init_config: dict[str, Any] | None = None,
  remote_config: dict[str, Any] | None = None,
  **kwargs,
@@ -134,7 +138,13 @@ class RayReplayBuffer(ReplayBuffer):
  ray_init_config = DEFAULT_RAY_INIT_CONFIG
  ray.init(**ray_init_config)

- remote_cls = ReplayBuffer.as_remote(remote_config).remote
+ remote_cls = replay_buffer_cls.as_remote(remote_config).remote
+ # We can detect if the buffer has a GPU allocated, if not
+ # we'll make sure that the data is sent to CPU when needed.
+ if remote_config is not None:
+ self.has_gpu = remote_config.get("num_gpus", 0) > 0
+ else:
+ self.has_gpu = False
  self._rb = remote_cls(*args, **kwargs)

  def close(self):
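
A hedged construction sketch for the new `replay_buffer_cls` argument (commented out because it needs a running Ray cluster; the storage and remote settings are illustrative):

    # from torchrl.data import LazyStackStorage, RayReplayBuffer, TensorDictReplayBuffer
    #
    # rb = RayReplayBuffer(
    #     storage=LazyStackStorage(10_000),
    #     replay_buffer_cls=TensorDictReplayBuffer,      # remote buffer class; previously fixed to ReplayBuffer
    #     remote_config={"num_cpus": 1, "num_gpus": 0},  # num_gpus=0, so extend() will move data to CPU first
    # )
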
@@ -158,6 +168,10 @@ class RayReplayBuffer(ReplayBuffer):
  return ray.get(pending_task)

  def extend(self, *args, **kwargs):
+ if not self.has_gpu:
+ # Move the data to GPU
+ args = [arg.to("cpu") for arg in args if hasattr(arg, "to")]
+ kwargs = {k: v.to("cpu") for k, v in kwargs.items() if hasattr(v, "to")}
  pending_task = self._rb.extend.remote(*args, **kwargs)
  return ray.get(pending_task)

@@ -702,7 +702,7 @@ class ReplayBuffer:
  self._sampler.add(index)
  return index

- def _extend(self, data: Sequence) -> torch.Tensor:
+ def _extend(self, data: Sequence, *, update_priority: bool = True) -> torch.Tensor:
  is_comp = is_compiling()
  nc = contextlib.nullcontext()
  with self._replay_lock if not is_comp else nc, self._write_lock if not is_comp else nc:
@@ -712,7 +712,9 @@ class ReplayBuffer:
  self._sampler.extend(index)
  return index

- def extend(self, data: Sequence) -> torch.Tensor:
+ def extend(
+ self, data: Sequence, *, update_priority: bool | None = None
+ ) -> torch.Tensor:
  """Extends the replay buffer with one or more elements contained in an iterable.

  If present, the inverse transforms will be called.`
@@ -721,6 +723,10 @@ class ReplayBuffer:
  data (iterable): collection of data to be added to the replay
  buffer.

+ Keyword Args:
+ update_priority (bool, optional): Whether to update the priority of the data. Defaults to True.
+ Without effect in this class. See :meth:`~torchrl.data.TensorDictReplayBuffer.extend` for more details.
+
  Returns:
  Indices of the data added to the replay buffer.

@@ -735,12 +741,16 @@ class ReplayBuffer:
  unbound elements can be provided (no PyTrees).

  """
+ if update_priority is not None:
+ raise NotImplementedError(
+ "update_priority is not supported in this class. See :meth:`~torchrl.data.TensorDictReplayBuffer.extend` for more details."
+ )
  if self._transform is not None and len(self._transform):
  with _set_dispatch_td_nn_modules(is_tensor_collection(data)):
  data = self._transform.inv(data)
  if data is None:
  return torch.zeros((0, self._storage.ndim), dtype=torch.long)
- return self._extend(data)
+ return self._extend(data, update_priority=update_priority)

  def update_priority(
  self,
@@ -914,8 +924,8 @@ class ReplayBuffer:
  self._iterator = iter(self)
  out = next(self._iterator)
  # if any, we don't want the device ref to be passed in distributed settings
- if out is not None:
- out.clear_device_()
+ if out is not None and (out.device != "cpu"):
+ out = out.copy().clear_device_()
  return out
  except StopIteration:
  self._iterator = None
@@ -1015,6 +1025,9 @@ class PrioritizedReplayBuffer(ReplayBuffer):
  storage (Storage, optional): the storage to be used. If none is provided
  a default :class:`~torchrl.data.replay_buffers.ListStorage` with
  ``max_size`` of ``1_000`` will be created.
+ sampler (Sampler, optional): the sampler to be used. If none is provided,
+ a default :class:`~torchrl.data.replay_buffers.PrioritizedSampler` with
+ ``alpha``, ``beta``, and ``eps`` will be created.
  collate_fn (callable, optional): merges a list of samples to form a
  mini-batch of Tensor(s)/outputs. Used when using batched
  loading from a map-style dataset. The default value will be decided
@@ -1107,6 +1120,7 @@ class PrioritizedReplayBuffer(ReplayBuffer):
  eps: float = 1e-8,
  dtype: torch.dtype = torch.float,
  storage: Storage | None = None,
+ sampler: Sampler | None = None,
  collate_fn: Callable | None = None,
  pin_memory: bool = False,
  prefetch: int | None = None,
@@ -1116,7 +1130,8 @@ class PrioritizedReplayBuffer(ReplayBuffer):
  ) -> None:
  if storage is None:
  storage = ListStorage(max_size=1_000)
- sampler = PrioritizedSampler(storage.max_size, alpha, beta, eps, dtype)
+ if sampler is None:
+ sampler = PrioritizedSampler(storage.max_size, alpha, beta, eps, dtype)
  super().__init__(
  storage=storage,
  sampler=sampler,
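
A hedged sketch of the new `sampler` argument (the slice sampler and its settings below are one possible choice, not the only one):

    from torchrl.data import PrioritizedReplayBuffer
    from torchrl.data.replay_buffers import ListStorage, PrioritizedSliceSampler

    storage = ListStorage(max_size=1_000)
    # a custom prioritized sampler replaces the default PrioritizedSampler
    sampler = PrioritizedSliceSampler(
        max_capacity=storage.max_size, alpha=0.7, beta=0.9, num_slices=4
    )
    rb = PrioritizedReplayBuffer(alpha=0.7, beta=0.9, storage=storage, sampler=sampler)
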
@@ -1347,7 +1362,20 @@ class TensorDictReplayBuffer(ReplayBuffer):
  self.update_tensordict_priority(data)
  return index

- def extend(self, tensordicts: TensorDictBase) -> torch.Tensor:
+ def extend(
+ self, tensordicts: TensorDictBase, *, update_priority: bool | None = None
+ ) -> torch.Tensor:
+ """Extends the replay buffer with a batch of data.
+
+ Args:
+ tensordicts (TensorDictBase): The data to extend the replay buffer with.
+
+ Keyword Args:
+ update_priority (bool, optional): Whether to update the priority of the data. Defaults to True.
+
+ Returns:
+ The indices of the data that were added to the replay buffer.
+ """
  if not isinstance(tensordicts, TensorDictBase):
  raise ValueError(
  f"{self.__class__.__name__} only accepts TensorDictBase subclasses. tensorclasses "
@@ -1365,8 +1393,17 @@ class TensorDictReplayBuffer(ReplayBuffer):
  # is that just doing this results in indices that are not sorted like the original data
  # so the actually indices will have to be used on the _storage directly (not on the buffer)
  self._set_index_in_td(tensordicts, index)
- # TODO: in principle this is a good idea but currently it doesn't work + it re-writes a priority that has just been written
- # self.update_tensordict_priority(tensordicts)
+ if update_priority is None:
+ update_priority = True
+ if update_priority:
+ try:
+ vector = tensordicts.get(self.priority_key)
+ if vector is not None:
+ self.update_priority(index, vector)
+ except Exception as e:
+ raise RuntimeError(
+ "Failed to update priority of extended data. You can try to set update_priority=False in the extend method and update the priority manually."
+ ) from e
  return index

  def _set_index_in_td(self, tensordict, index):
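
A hedged usage sketch of the new behaviour (buffer class and keys chosen for illustration; `priority_key` defaults to "td_error"):

    import torch
    from tensordict import TensorDict
    from torchrl.data import LazyTensorStorage, TensorDictPrioritizedReplayBuffer

    rb = TensorDictPrioritizedReplayBuffer(
        alpha=0.7, beta=0.9, storage=LazyTensorStorage(100), priority_key="td_error"
    )
    batch = TensorDict(
        {"obs": torch.randn(8, 4), "td_error": torch.rand(8)}, batch_size=[8]
    )
    idx = rb.extend(batch)  # priorities under "td_error" are written in the same call
    # rb.extend(batch, update_priority=False)  # opt out, then call rb.update_priority(...) manually
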
@@ -1685,8 +1722,10 @@ class RemoteTensorDictReplayBuffer(TensorDictReplayBuffer):
  def add(self, data: TensorDictBase) -> int:
  return super().add(data)

- def extend(self, tensordicts: list | TensorDictBase) -> torch.Tensor:
- return super().extend(tensordicts)
+ def extend(
+ self, tensordicts: list | TensorDictBase, *, update_priority: bool | None = None
+ ) -> torch.Tensor:
+ return super().extend(tensordicts, update_priority=update_priority)

  def update_priority(
  self, index: int | torch.Tensor, priority: int | torch.Tensor