torchrl-0.11.0-cp313-cp313-win_amd64.whl → torchrl-0.11.1-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -140,17 +140,21 @@ def main(cfg: DictConfig):  # noqa: F821
         buffer_size=buffer_size,
         buffer_scratch_dir=scratch_dir,
         device=device,
-        prefetch=prefetch if not profiling_enabled else None,
+        prefetch=prefetch,  # Always use prefetch for better throughput
         pixel_obs=cfg.env.from_pixels,
         grayscale=cfg.env.grayscale,
         image_size=cfg.env.image_size,
     )
 
     # Create storage transform for extend-time processing (applied once per frame)
+    # When GPU is available, GPUImageTransform handles image processing in the env,
+    # so we skip the heavy CPU transforms in storage_transform
+    gpu_transforms = device.type == "cuda"
     storage_transform = make_storage_transform(
         pixel_obs=cfg.env.from_pixels,
         grayscale=cfg.env.grayscale,
         image_size=cfg.env.image_size,
+        gpu_transforms=gpu_transforms,
     )
 
     # Create policy version tracker for async collection
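
Note on the prefetch change above: with prefetch enabled, the replay buffer prepares upcoming batches in background threads so sampling overlaps with the optimizer step. A minimal sketch of the mechanism, with an illustrative buffer configuration (storage size and batch size are placeholders):

    import torch
    from tensordict import TensorDict
    from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer

    # prefetch=3 keeps up to three sampled batches ready ahead of time
    rb = TensorDictReplayBuffer(
        storage=LazyMemmapStorage(10_000),
        prefetch=3,
        batch_size=32,
    )
    rb.extend(TensorDict({"obs": torch.randn(100, 4)}, batch_size=[100]))
    batch = rb.sample()  # served from the prefetch queue once it is warm
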
@@ -247,7 +251,11 @@ def main(cfg: DictConfig):  # noqa: F821
         compile_warmup = 3
         torchrl_logger.info(f"Compiling loss modules with warmup={compile_warmup}")
         backend = compile_cfg.backend
-        mode = compile_cfg.mode
+        cudagraphs = compile_cfg.cudagraphs
+
+        # Build compile options - disable CUDA graphs if configured (default)
+        # CUDA graphs conflict with dynamic RSSM rollout loop
+        compile_options = {"triton.cudagraphs": cudagraphs}
 
         # Note: We do NOT compile rssm_prior/rssm_posterior here because they are
         # shared with the policy used in the collector. Compiling them would cause
@@ -260,17 +268,25 @@ def main(cfg: DictConfig):  # noqa: F821
         world_model_loss = compile_with_warmup(
             world_model_loss,
             backend=backend,
-            mode=mode,
             fullgraph=False,
             warmup=compile_warmup,
+            options=compile_options,
         )
         if "actor" in compile_losses:
             actor_loss = compile_with_warmup(
-                actor_loss, backend=backend, mode=mode, warmup=compile_warmup
+                actor_loss,
+                backend=backend,
+                fullgraph=False,
+                warmup=compile_warmup,
+                options=compile_options,
             )
         if "value" in compile_losses:
             value_loss = compile_with_warmup(
-                value_loss, backend=backend, mode=mode, warmup=compile_warmup
+                value_loss,
+                backend=backend,
+                fullgraph=False,
+                warmup=compile_warmup,
+                options=compile_options,
             )
     else:
         compile_warmup = 0
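
The options dict introduced above is forwarded to the inductor backend, where "triton.cudagraphs" toggles CUDA-graph capture. A self-contained sketch of the same pattern with plain torch.compile (fn is an illustrative stand-in for the loss modules):

    import torch

    def fn(x: torch.Tensor) -> torch.Tensor:
        return torch.sin(x) + x

    compiled = torch.compile(
        fn,
        backend="inductor",
        fullgraph=False,
        # disable CUDA-graph capture, which assumes static shapes/control flow
        options={"triton.cudagraphs": False},
    )
    print(compiled(torch.randn(8)))
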
@@ -10,7 +10,7 @@ from contextlib import nullcontext
 
 import torch
 import torch.nn as nn
-from tensordict import NestedKey
+from tensordict import NestedKey, TensorDictBase
 from tensordict.nn import (
     InteractionType,
     ProbabilisticTensorDictModule,
@@ -38,7 +38,6 @@ from torchrl.envs import (
     DreamerEnv,
     EnvCreator,
     ExcludeTransform,
-    # ExcludeTransform,
     FrameSkipTransform,
     GrayScale,
     GymEnv,
@@ -50,6 +49,7 @@ from torchrl.envs import (
     StepCounter,
     TensorDictPrimer,
     ToTensorImage,
+    Transform,
     TransformedEnv,
 )
 from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
@@ -260,6 +260,89 @@ class DreamerProfiler:
         return self.total_optim_steps >= target_steps
 
 
+class GPUImageTransform(Transform):
+    """Composite transform that processes images on GPU for faster execution.
+
+    This transform:
+    1. Moves pixels_int to GPU
+    2. Runs ToTensorImage (permute + divide by 255)
+    3. Optionally runs GrayScale
+    4. Runs Resize
+    5. Keeps output on GPU for fast policy inference
+
+    This avoids device mismatch issues by not using DeviceCastTransform on the
+    full tensordict - only the pixel processing happens on GPU.
+    """
+
+    def __init__(
+        self,
+        device: torch.device,
+        image_size: int,
+        grayscale: bool = False,
+        in_key: str = "pixels_int",
+        out_key: str = "pixels",
+    ):
+        super().__init__(in_keys=[in_key], out_keys=[out_key])
+        self.device = device
+        self.image_size = image_size
+        self.grayscale = grayscale
+        self.in_key = in_key
+        self.out_key = out_key
+
+    def _apply_transform(self, pixels_int: torch.Tensor) -> torch.Tensor:
+        # Move to GPU
+        pixels = pixels_int.to(self.device)
+        # ToTensorImage: permute W x H x C -> C x W x H and normalize
+        pixels = pixels.permute(*list(range(pixels.ndimension() - 3)), -1, -3, -2)
+        pixels = pixels.float().div(255)
+        # GrayScale
+        if self.grayscale:
+            pixels = pixels.mean(dim=-3, keepdim=True)
+        # Resize using interpolate
+        if pixels.shape[-2:] != (self.image_size, self.image_size):
+            # Add batch dim if needed for interpolate
+            needs_squeeze = pixels.ndim == 3
+            if needs_squeeze:
+                pixels = pixels.unsqueeze(0)
+            pixels = torch.nn.functional.interpolate(
+                pixels,
+                size=(self.image_size, self.image_size),
+                mode="bilinear",
+                align_corners=False,
+                antialias=True,
+            )
+            if needs_squeeze:
+                pixels = pixels.squeeze(0)
+        return pixels
+
+    def _reset(
+        self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase
+    ) -> TensorDictBase:
+        return self._call(tensordict_reset)
+
+    def transform_observation_spec(self, observation_spec):
+        # Update the spec for the output key
+        # Note: Keep spec on CPU to match other specs in Composite
+        # The actual transform will put data on GPU, but spec device must be uniform
+        from torchrl.data import Unbounded
+
+        in_spec = observation_spec[self.in_key]
+        # Output shape: (C, H, W) where C=1 if grayscale else 3
+        out_channels = 1 if self.grayscale else 3
+        out_shape = (
+            *in_spec.shape[:-3],
+            out_channels,
+            self.image_size,
+            self.image_size,
+        )
+        # Use in_spec.device to maintain device consistency in Composite
+        out_spec = Unbounded(
+            shape=out_shape, dtype=torch.float32, device=in_spec.device
+        )
+        observation_spec[self.out_key] = out_spec
+        return observation_spec
+
+
 def _make_env(cfg, device, from_pixels=False):
     lib = cfg.env.backend
     if lib in ("gym", "gymnasium"):
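
The tensor operations inside GPUImageTransform._apply_transform can be exercised standalone; the sketch below runs the same HWC-to-CHW permute, normalization, grayscale reduction, and antialiased resize on CPU so it stays self-contained (the 84-pixel input and 64-pixel target are illustrative):

    import torch
    import torch.nn.functional as F

    frame = torch.randint(0, 256, (84, 84, 3), dtype=torch.uint8)  # H x W x C
    pixels = frame.permute(-1, -3, -2).float().div(255)  # C x H x W in [0, 1]
    pixels = pixels.mean(dim=-3, keepdim=True)           # grayscale: 1 x H x W
    pixels = F.interpolate(
        pixels.unsqueeze(0),  # interpolate expects a leading batch dim
        size=(64, 64),
        mode="bilinear",
        align_corners=False,
        antialias=True,
    ).squeeze(0)
    assert pixels.shape == (1, 64, 64)
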
@@ -294,22 +377,44 @@ def _make_env(cfg, device, from_pixels=False):
     return env
 
 
-def transform_env(cfg, env):
+def transform_env(cfg, env, device=None):
+    """Apply transforms to environment.
+
+    Args:
+        cfg: Config object
+        env: The environment to transform
+        device: If specified and is a CUDA device, use GPU-accelerated image
+            processing which is ~50-100x faster than CPU.
+    """
     if not isinstance(env, TransformedEnv):
         env = TransformedEnv(env)
     if cfg.env.from_pixels:
-        # transforms pixel from 0-255 to 0-1 (uint8 to float32)
+        # Rename original pixels for processing
         env.append_transform(
             RenameTransform(in_keys=["pixels"], out_keys=["pixels_int"])
         )
-        env.append_transform(
-            ToTensorImage(from_int=True, in_keys=["pixels_int"], out_keys=["pixels"])
-        )
-        if cfg.env.grayscale:
-            env.append_transform(GrayScale())
 
-        image_size = cfg.env.image_size
-        env.append_transform(Resize(image_size, image_size))
+        # Use GPU-accelerated image processing if device is CUDA
+        if device is not None and str(device).startswith("cuda"):
+            env.append_transform(
+                GPUImageTransform(
+                    device=device,
+                    image_size=cfg.env.image_size,
+                    grayscale=cfg.env.grayscale,
+                    in_key="pixels_int",
+                    out_key="pixels",
+                )
+            )
+        else:
+            # CPU fallback: use standard transforms
+            env.append_transform(
+                ToTensorImage(
+                    from_int=True, in_keys=["pixels_int"], out_keys=["pixels"]
+                )
+            )
+            if cfg.env.grayscale:
+                env.append_transform(GrayScale())
+            env.append_transform(Resize(cfg.env.image_size, cfg.env.image_size))
 
     env.append_transform(DoubleToFloat())
     env.append_transform(RewardSum())
@@ -329,24 +434,38 @@ def make_environments(cfg, parallel_envs=1, logger=None):
     """
 
     def train_env_factory():
-        """Factory function for creating training environments."""
-        func = functools.partial(
-            _make_env, cfg=cfg, device=_default_device(cfg.env.device)
-        )
+        """Factory function for creating training environments.
+
+        Note: This factory runs inside collector worker processes. We use
+        CUDA if available for GPU-accelerated image transforms (ToTensorImage,
+        Resize) which are ~50-100x faster than CPU. The cfg.env.device setting
+        is ignored in favor of auto-detecting CUDA availability.
+        """
+        # Use CUDA for transforms if available, regardless of cfg.env.device
+        # This is critical: image transforms (Resize, ToTensorImage) are ~50-100x
+        # faster on GPU. DMControl/Gym render on CPU, but we move to GPU for transforms.
+        transform_device = _default_device(None)  # Returns CUDA if available
+        # Base env still uses cfg.env.device for compatibility
+        env_device = _default_device(cfg.env.device)
+        func = functools.partial(_make_env, cfg=cfg, device=env_device)
         train_env = ParallelEnv(
             parallel_envs,
             EnvCreator(func),
             serial_for_single=True,
         )
-        train_env = transform_env(cfg, train_env)
+        # Pass transform_device to enable GPU-accelerated image transforms
+        train_env = transform_env(cfg, train_env, device=transform_device)
        train_env.set_seed(cfg.env.seed)
        return train_env
 
     # Create eval env directly (not a factory)
+    # Use CUDA for transforms if available, regardless of cfg.env.device
+    transform_device = _default_device(None)  # Returns CUDA if available
+    env_device = _default_device(cfg.env.device)
     func = functools.partial(
         _make_env,
         cfg=cfg,
-        device=_default_device(cfg.env.device),
+        device=env_device,
         from_pixels=cfg.logger.video,
     )
     eval_env = ParallelEnv(
@@ -354,7 +473,8 @@ def make_environments(cfg, parallel_envs=1, logger=None):
         EnvCreator(func),
         serial_for_single=True,
     )
-    eval_env = transform_env(cfg, eval_env)
+    # Pass transform_device to enable GPU-accelerated image transforms
+    eval_env = transform_env(cfg, eval_env, device=transform_device)
     eval_env.set_seed(cfg.env.seed + 1)
     if cfg.logger.video:
         eval_env.insert_transform(
@@ -681,15 +801,32 @@ def make_storage_transform(
     pixel_obs=True,
     grayscale=True,
     image_size,
+    gpu_transforms=False,
 ):
     """Create transforms to be applied at extend-time (once per frame).
 
-    These heavy transforms (ToTensorImage, GrayScale, Resize) are applied once
-    when data is added to the buffer, rather than on every sample.
+    Args:
+        pixel_obs: Whether observations are pixel-based.
+        grayscale: Whether to convert to grayscale.
+        image_size: Target image size.
+        gpu_transforms: If True, skip heavy image transforms (ToTensorImage,
+            GrayScale, Resize) since they're already applied by GPUImageTransform
+            in the environment. Only ExcludeTransform is applied to filter keys.
     """
     if not pixel_obs:
         return None
 
+    # When GPU transforms are enabled, GPUImageTransform already processes
+    # pixels_int -> pixels with normalization, grayscale, and resize.
+    # We only need to filter out the intermediate pixels_int key.
+    if gpu_transforms:
+        storage_transforms = Compose(
+            # Just exclude pixels_int, keep everything else including processed pixels
+            ExcludeTransform("pixels_int", ("next", "pixels_int")),
+        )
+        return storage_transforms
+
+    # CPU fallback: apply heavy transforms at storage time
     storage_transforms = Compose(
         ExcludeTransform("pixels", ("next", "pixels"), inverse=True),
         ToTensorImage(
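
The gpu_transforms branch reduces the storage transform to key filtering only. A small sketch of what ExcludeTransform does to a tensordict (key names follow the diff; shapes are illustrative):

    import torch
    from tensordict import TensorDict
    from torchrl.envs import Compose, ExcludeTransform

    td = TensorDict(
        {
            "pixels": torch.rand(1, 64, 64),
            "pixels_int": torch.randint(0, 256, (84, 84, 3), dtype=torch.uint8),
            "next": {"pixels_int": torch.randint(0, 256, (84, 84, 3), dtype=torch.uint8)},
        },
        batch_size=[],
    )
    t = Compose(ExcludeTransform("pixels_int", ("next", "pixels_int")))
    out = t(td)
    assert "pixels_int" not in out.keys()  # processed "pixels" is kept
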
@@ -741,7 +878,6 @@ def make_replay_buffer(
     )
 
     replay_buffer = TensorDictReplayBuffer(
-        pin_memory=False,
         prefetch=prefetch,
         storage=LazyMemmapStorage(
             buffer_size,
@@ -755,7 +891,9 @@ def make_replay_buffer(
             strict_length=False,
             traj_key=("collector", "traj_ids"),
             cache_values=False,  # Disabled for async collection (cache not synced across processes)
-            # Don't compile the sampler - inductor has C++ codegen bugs for int64 ops
+            use_gpu=device.type == "cuda"
+            if device is not None
+            else False,  # Speed up trajectory computation on GPU
         ),
         transform=sample_transforms,
         batch_size=batch_size,
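
The use_gpu argument above lands on the buffer's sampler. Assuming that sampler is a SliceSampler (consistent with the traj_key/strict_length/cache_values arguments in this hunk), a hedged construction sketch:

    import torch
    from torchrl.data.replay_buffers import SliceSampler

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sampler = SliceSampler(
        slice_len=64,  # illustrative; the script's actual value is not shown here
        traj_key=("collector", "traj_ids"),
        strict_length=False,
        cache_values=False,
        use_gpu=device.type == "cuda",  # trajectory-start computation on GPU
    )
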
Binary file
@@ -585,6 +585,14 @@ class BaseCollector(IterableDataset, metaclass=abc.ABCMeta):
             ...     "actor": actor_weights,
             ...     "critic": critic_weights,
             ... })
+            >>>
+            >>> # Per-worker weight updates (for distinct policy factories)
+            >>> # Each worker can have independently updated weights
+            >>> collector.update_policy_weights_({
+            ...     0: worker_0_weights,
+            ...     1: worker_1_weights,
+            ...     2: worker_2_weights,
+            ... })
 
         Args:
             policy_or_weights: The weights to update with. Can be:
@@ -593,6 +601,8 @@ class BaseCollector(IterableDataset, metaclass=abc.ABCMeta):
             - ``TensorDictModuleBase``: A TensorDict module whose weights will be extracted
             - ``TensorDictBase``: A TensorDict containing weights
             - ``dict``: A regular dict containing weights
+            - ``dict[int, TensorDictBase]``: Per-worker weights where keys are worker indices.
+              This is used with distinct policy factories where each worker has independent weights.
             - ``None``: Will try to get weights from server using ``_get_server_weights()``
 
         Keyword Args:
@@ -429,16 +429,30 @@ class MultiCollector(BaseCollector, metaclass=_MultiCollectorMeta):
             raise TypeError(
                 "Cannot specify both weight_sync_schemes and weight_updater."
             )
+        # Check if policy_factory entries are all the same (replicated from single factory)
+        # vs different factories per worker.
+        has_uniform_policy_factory = any(policy_factory) and all(
+            f is policy_factory[0] for f in policy_factory
+        )
+        has_distinct_policy_factory = (
+            any(policy_factory) and not has_uniform_policy_factory
+        )
         if (
             weight_sync_schemes is not None
             and not weight_sync_schemes
             and weight_updater is None
-            and (isinstance(policy, nn.Module) or any(policy_factory))
         ):
-            # Set up a default local shared-memory sync scheme for the policy.
-            # This is used to propagate weights from the orchestrator policy
-            # (possibly combined with a policy_factory) down to worker policies.
-            weight_sync_schemes["policy"] = SharedMemWeightSyncScheme()
+            if isinstance(policy, nn.Module) or has_uniform_policy_factory:
+                # Set up a default local shared-memory sync scheme for the policy.
+                # This is used to propagate weights from the orchestrator policy
+                # (possibly combined with a policy_factory) down to worker policies.
+                weight_sync_schemes["policy"] = SharedMemWeightSyncScheme()
+            elif has_distinct_policy_factory:
+                # Distinct factories: set up per-worker weight sync scheme.
+                # Each worker maintains independent weights that can be updated individually.
+                weight_sync_schemes["policy"] = SharedMemWeightSyncScheme(
+                    per_worker_weights=True
+                )
 
         self._setup_multi_weight_sync(weight_updater, weight_sync_schemes)
 
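
The uniform-vs-distinct check above relies on object identity: replicating one factory across workers yields a list whose entries all pass an `is` comparison against the first element. A pure-Python illustration (the factories are stand-ins):

    def make_policy():
        return object()  # placeholder for a real policy constructor

    replicated = [make_policy] * 3                 # one factory, three workers
    distinct = [make_policy, make_policy, lambda: object()]

    def is_uniform(factories):
        return any(factories) and all(f is factories[0] for f in factories)

    assert is_uniform(replicated) is True
    assert is_uniform(distinct) is False
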
@@ -417,7 +417,8 @@ def _make_policy_factory(
         raise ValueError("policy cannot be used with policy_factory")
     elif has_policy_factory:
         if isinstance(policy_factory, Sequence):
-            return policy_factory
+            # Use worker_idx to get the correct factory for this worker
+            policy = policy_factory[worker_idx]()
         else:
             policy = policy_factory()
 
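
The one-line fix above matters because the pre-fix code handed the entire factory list back instead of building a policy. A tiny illustration of the corrected behavior (the factories and worker_idx are stand-ins):

    policy_factory = [lambda i=i: f"policy-{i}" for i in range(3)]

    def make_policy_for(worker_idx: int) -> str:
        # fixed behavior: index the sequence, then call that worker's factory
        return policy_factory[worker_idx]()

    assert make_policy_for(2) == "policy-2"
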
@@ -93,7 +93,8 @@ class Writer(ABC):
         )
         mesh = torch.stack(
             torch.meshgrid(
-                *(torch.arange(dim, device=device) for dim in self._storage.shape[1:])
+                *(torch.arange(dim, device=device) for dim in self._storage.shape[1:]),
+                indexing="ij",
             ),
             -1,
         ).flatten(0, -2)
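
torch.meshgrid warns when indexing is left implicit, and the "xy" mode would transpose the first two grids, so pinning indexing="ij" preserves the Writer's coordinate enumeration. A self-contained check with illustrative shapes:

    import torch

    rows, cols = torch.arange(2), torch.arange(3)
    gi, gj = torch.meshgrid(rows, cols, indexing="ij")  # matrix-style indexing
    assert gi.shape == (2, 3)
    # Stack and flatten to enumerate coordinates, mirroring the Writer code:
    coords = torch.stack((gi, gj), -1).flatten(0, -2)   # shape (6, 2)
    assert coords[:3].tolist() == [[0, 0], [0, 1], [0, 2]]
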
@@ -5601,7 +5601,7 @@ class Composite(TensorSpec):
         elif self.data_cls is not None:
             out = {}
         else:
-            out = TensorDict._new_unsafe({}, _size([]))
+            out = TensorDict._new_unsafe({}, self.shape)
         for key, item in vals.items():
             if item is None:
                 raise RuntimeError(
@@ -5644,7 +5644,7 @@ class Composite(TensorSpec):
         else:
 
             def empty(vals):
-                out = TensorDict._new_unsafe({}, _size([]))
+                out = TensorDict._new_unsafe({}, self.shape)
                 return vals, out
 
             funcs.append(empty)
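
Both hunks above replace an empty batch size with the spec's own shape, so tensordicts built from a batched Composite inherit its batch dimensions. A sketch through the public API (the private _new_unsafe call itself is internal):

    import torch
    from torchrl.data import Composite, Unbounded

    spec = Composite(
        {"obs": Unbounded(shape=(3, 4))},
        shape=(3,),  # batched spec: leading dim shared by all entries
    )
    td = spec.zero()  # materialize a zeroed tensordict from the spec
    assert td.batch_size == torch.Size([3])
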
@@ -96,6 +96,22 @@ class _dispatch_caller_parallel:
         # if the object returned is not a callable
         return iter(self.__call__())
 
+    def __getattr__(self, name):
+        """Support chained attribute access: env_parallel.a.b -> sends ('a', 'b') to workers."""
+        # Don't chain special/dunder methods - these are often called by
+        # display systems (e.g., Jupyter's _repr_html_) and shouldn't be
+        # dispatched to workers
+        if name.startswith("_"):
+            raise AttributeError(
+                f"Accessing private/special attribute {name!r} is not supported "
+                f"on dispatched parallel env attributes."
+            )
+        if isinstance(self.attr, tuple):
+            new_attr = self.attr + (name,)
+        else:
+            new_attr = (self.attr, name)
+        return _dispatch_caller_parallel(new_attr, self.parallel_env)
+
 
 class _dispatch_caller_serial:
     def __init__(self, list_callable: list[Callable, Any]):
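
The __getattr__ added above turns attribute chains into key-path tuples that are shipped to workers in one message. A pure-Python sketch of the pattern (Dispatcher and backend are illustrative, not torchrl API):

    class Dispatcher:
        def __init__(self, attr, backend):
            self.attr = attr
            self.backend = backend

        def __getattr__(self, name):
            if name.startswith("_"):
                # keep repr/display hooks from being dispatched to workers
                raise AttributeError(name)
            path = self.attr + (name,) if isinstance(self.attr, tuple) else (self.attr, name)
            return Dispatcher(path, self.backend)

        def __call__(self, *args, **kwargs):
            return self.backend(self.attr, *args, **kwargs)

    def backend(path, *args, **kwargs):
        return f"dispatched {path} with args={args}"

    d = Dispatcher("stats", backend)
    print(d.rewards.mean(0))  # dispatched ('stats', 'rewards', 'mean') with args=(0,)
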
@@ -7673,6 +7673,26 @@ class StepCounter(Transform):
             self._truncated_keys = truncated_keys
         return truncated_keys
 
+    @property
+    def all_truncated_keys(self) -> list[NestedKey]:
+        """Returns truncated keys for ALL reset keys (including nested ones).
+
+        Used for propagating truncated to nested agent-level keys in MARL envs.
+        """
+        all_truncated_keys = self.__dict__.get("_all_truncated_keys", None)
+        if all_truncated_keys is None:
+            all_truncated_keys = []
+            if self.parent is None:
+                return self.truncated_keys
+            for reset_key in self.parent.reset_keys:
+                if isinstance(reset_key, str):
+                    key = self.truncated_key
+                else:
+                    key = (*reset_key[:-1], self.truncated_key)
+                all_truncated_keys.append(key)
+            self.__dict__["_all_truncated_keys"] = all_truncated_keys
+        return all_truncated_keys
+
     @property
     def done_keys(self) -> list[NestedKey]:
         done_keys = self.__dict__.get("_done_keys", None)
@@ -7688,6 +7708,26 @@ class StepCounter(Transform):
         self.__dict__["_done_keys"] = done_keys
         return done_keys
 
+    @property
+    def all_done_keys(self) -> list[NestedKey]:
+        """Returns done keys for ALL reset keys (including nested ones).
+
+        Used for propagating done to nested agent-level keys in MARL envs.
+        """
+        all_done_keys = self.__dict__.get("_all_done_keys", None)
+        if all_done_keys is None:
+            all_done_keys = []
+            if self.parent is None:
+                return self.done_keys
+            for reset_key in self.parent.reset_keys:
+                if isinstance(reset_key, str):
+                    key = "done"
+                else:
+                    key = (*reset_key[:-1], "done")
+                all_done_keys.append(key)
+            self.__dict__["_all_done_keys"] = all_done_keys
+        return all_done_keys
+
     @property
     def terminated_keys(self) -> list[NestedKey]:
         terminated_keys = self.__dict__.get("_terminated_keys", None)
@@ -7803,8 +7843,59 @@ class StepCounter(Transform):
             done = truncated | done  # we assume no done after reset
             next_tensordict.set(done_key, done)
             next_tensordict.set(truncated_key, truncated)
+
+        # Propagate truncated/done to nested agent-level keys in MARL envs
+        # This ensures that when max_steps is reached, all agent truncated/done keys are updated
+        if self.max_steps is not None:
+            self._propagate_to_nested_keys(next_tensordict)
+
         return next_tensordict
 
+    def _propagate_to_nested_keys(self, next_tensordict: TensorDictBase) -> None:
+        """Propagate truncated and done values to nested agent-level keys.
+
+        In MARL envs, there may be nested agent-level truncated/done keys that
+        are children of the root truncated/done. When StepCounter sets truncated
+        at the root level, we need to propagate this to nested keys.
+        """
+        # Get the set of keys we already updated (filtered keys)
+        updated_truncated = set(self.truncated_keys)
+        updated_done = set(self.done_keys)
+
+        # Propagate truncated to nested keys
+        for nested_key in self.all_truncated_keys:
+            if nested_key in updated_truncated:
+                continue
+            # Find the parent truncated key that should be propagated
+            nested_truncated = next_tensordict.get(nested_key, None)
+            if nested_truncated is None:
+                continue
+            # Find a parent truncated key to propagate from
+            for parent_key in self.truncated_keys:
+                parent_truncated = next_tensordict.get(parent_key, None)
+                if parent_truncated is not None:
+                    # Expand parent truncated to match nested shape and apply OR
+                    expanded = parent_truncated.expand_as(nested_truncated)
+                    next_tensordict.set(nested_key, nested_truncated | expanded)
+                    break
+
+        # Propagate done to nested keys if update_done is True
+        if self.update_done:
+            for nested_key in self.all_done_keys:
+                if nested_key in updated_done:
+                    continue
+                nested_done = next_tensordict.get(nested_key, None)
+                if nested_done is None:
+                    continue
+                # Find a parent done key to propagate from
+                for parent_key in self.done_keys:
+                    parent_done = next_tensordict.get(parent_key, None)
+                    if parent_done is not None:
+                        # Expand parent done to match nested shape and apply OR
+                        expanded = parent_done.expand_as(nested_done)
+                        next_tensordict.set(nested_key, nested_done | expanded)
+                        break
+
     def transform_observation_spec(self, observation_spec: Composite) -> Composite:
         if not isinstance(observation_spec, Composite):
             raise ValueError(
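
The propagation above broadcasts a root-level flag onto each nested agent-level flag and ORs them together. A tensor-level sketch with illustrative shapes (batch=2, agents=3); note this sketch adds an explicit unsqueeze for the agent dimension, which expand_as requires:

    import torch

    root_truncated = torch.tensor([[True], [False]])           # (2, 1)
    agent_truncated = torch.zeros(2, 3, 1, dtype=torch.bool)   # (2, agents, 1)

    expanded = root_truncated.unsqueeze(1).expand_as(agent_truncated)
    agent_truncated = agent_truncated | expanded
    assert agent_truncated[0].all() and not agent_truncated[1].any()
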
torchrl/objectives/cql.py CHANGED
@@ -298,6 +298,7 @@ class CQLLoss(LossModule):
         lagrange_thresh: float = 0.0,
         reduction: str | None = None,
         deactivate_vmap: bool = False,
+        scalar_output_mode: str | None = None,
     ) -> None:
         self._out_keys = None
         if reduction is None:
@@ -381,6 +382,23 @@ class CQLLoss(LossModule):
         )
         self._make_vmap()
         self.reduction = reduction
+
+        # Handle scalar_output_mode for reduction="none"
+        if reduction == "none" and scalar_output_mode is None:
+            warnings.warn(
+                "CQLLoss with reduction='none' cannot include scalar values (alpha, entropy) "
+                "in the output TensorDict without changing their shape. These values will be "
+                "excluded from the output. You can access them via `loss_module._alpha` and "
+                "compute entropy from the log_prob in the actor loss metadata. "
+                "To suppress this warning, pass `scalar_output_mode='exclude'` to the constructor. "
+                "Alternatively, pass `scalar_output_mode='non_tensor'` to include them as non-tensor data. "
+                "This is a known limitation we're working on improving.",
+                category=UserWarning,
+                stacklevel=2,
+            )
+            scalar_output_mode = "exclude"
+        self.scalar_output_mode = scalar_output_mode
+
         _ = self.target_entropy
 
     def _make_vmap(self):
@@ -548,18 +566,28 @@ class CQLLoss(LossModule):
         tensordict.set(
             self.tensor_keys.priority, metadata.pop("td_error").detach().max(0).values
         )
+        entropy = -actor_metadata.get(self.tensor_keys.log_prob)
         out = {
             "loss_actor": loss_actor,
             "loss_actor_bc": loss_actor_bc,
             "loss_qvalue": q_loss,
             "loss_cql": cql_loss,
             "loss_alpha": loss_alpha,
-            "alpha": self._alpha,
-            "entropy": -actor_metadata.get(self.tensor_keys.log_prob).mean().detach(),
         }
         if self.with_lagrange:
             out["loss_alpha_prime"] = alpha_prime_loss.mean()
-        td_loss = TensorDict(out)
+
+        # Handle batch_size and scalar values (alpha, entropy) based on reduction mode
+        if self.reduction == "none":
+            batch_size = tensordict.batch_size
+            td_loss = TensorDict(out, batch_size=batch_size)
+            if self.scalar_output_mode == "non_tensor":
+                td_loss.set_non_tensor("alpha", self._alpha)
+                td_loss.set_non_tensor("entropy", entropy.detach().mean())
+        else:
+            out["alpha"] = self._alpha
+            out["entropy"] = entropy.detach().mean()
+            td_loss = TensorDict(out)
         self._clear_weakrefs(
             tensordict,
             td_loss,
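
The non_tensor mode above exists because a TensorDict with a real batch_size rejects 0-dim scalar entries; non-tensor storage side-steps the shape check. A minimal sketch with illustrative keys and values:

    import torch
    from tensordict import TensorDict

    td_loss = TensorDict(
        {"loss_actor": torch.randn(32)},  # per-sample losses, reduction="none"
        batch_size=[32],
    )
    td_loss.set_non_tensor("alpha", 0.2)  # scalar stored out of band
    assert td_loss.get_non_tensor("alpha") == 0.2
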