torchrl-0.11.0-cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (394)
  1. benchmarks/benchmark_batched_envs.py +104 -0
  2. benchmarks/conftest.py +91 -0
  3. benchmarks/ecosystem/gym_env_throughput.py +321 -0
  4. benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
  5. benchmarks/requirements.txt +7 -0
  6. benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
  7. benchmarks/test_collectors_benchmark.py +240 -0
  8. benchmarks/test_compressed_storage_benchmark.py +145 -0
  9. benchmarks/test_envs_benchmark.py +133 -0
  10. benchmarks/test_llm.py +101 -0
  11. benchmarks/test_non_tensor_env_benchmark.py +70 -0
  12. benchmarks/test_objectives_benchmarks.py +1199 -0
  13. benchmarks/test_replaybuffer_benchmark.py +254 -0
  14. sota-check/README.md +35 -0
  15. sota-implementations/README.md +142 -0
  16. sota-implementations/a2c/README.md +39 -0
  17. sota-implementations/a2c/a2c_atari.py +291 -0
  18. sota-implementations/a2c/a2c_mujoco.py +273 -0
  19. sota-implementations/a2c/utils_atari.py +240 -0
  20. sota-implementations/a2c/utils_mujoco.py +160 -0
  21. sota-implementations/bandits/README.md +7 -0
  22. sota-implementations/bandits/dqn.py +126 -0
  23. sota-implementations/cql/cql_offline.py +198 -0
  24. sota-implementations/cql/cql_online.py +249 -0
  25. sota-implementations/cql/discrete_cql_offline.py +180 -0
  26. sota-implementations/cql/discrete_cql_online.py +227 -0
  27. sota-implementations/cql/utils.py +471 -0
  28. sota-implementations/crossq/crossq.py +271 -0
  29. sota-implementations/crossq/utils.py +320 -0
  30. sota-implementations/ddpg/ddpg.py +231 -0
  31. sota-implementations/ddpg/utils.py +325 -0
  32. sota-implementations/decision_transformer/dt.py +163 -0
  33. sota-implementations/decision_transformer/lamb.py +167 -0
  34. sota-implementations/decision_transformer/online_dt.py +178 -0
  35. sota-implementations/decision_transformer/utils.py +562 -0
  36. sota-implementations/discrete_sac/discrete_sac.py +243 -0
  37. sota-implementations/discrete_sac/utils.py +324 -0
  38. sota-implementations/dqn/README.md +30 -0
  39. sota-implementations/dqn/dqn_atari.py +272 -0
  40. sota-implementations/dqn/dqn_cartpole.py +236 -0
  41. sota-implementations/dqn/utils_atari.py +132 -0
  42. sota-implementations/dqn/utils_cartpole.py +90 -0
  43. sota-implementations/dreamer/README.md +129 -0
  44. sota-implementations/dreamer/dreamer.py +586 -0
  45. sota-implementations/dreamer/dreamer_utils.py +1107 -0
  46. sota-implementations/expert-iteration/README.md +352 -0
  47. sota-implementations/expert-iteration/ei_utils.py +770 -0
  48. sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
  49. sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
  50. sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
  51. sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
  52. sota-implementations/gail/gail.py +327 -0
  53. sota-implementations/gail/gail_utils.py +68 -0
  54. sota-implementations/gail/ppo_utils.py +157 -0
  55. sota-implementations/grpo/README.md +273 -0
  56. sota-implementations/grpo/grpo-async.py +437 -0
  57. sota-implementations/grpo/grpo-sync.py +435 -0
  58. sota-implementations/grpo/grpo_utils.py +843 -0
  59. sota-implementations/grpo/requirements_gsm8k.txt +11 -0
  60. sota-implementations/grpo/requirements_ifeval.txt +16 -0
  61. sota-implementations/impala/README.md +33 -0
  62. sota-implementations/impala/impala_multi_node_ray.py +292 -0
  63. sota-implementations/impala/impala_multi_node_submitit.py +284 -0
  64. sota-implementations/impala/impala_single_node.py +261 -0
  65. sota-implementations/impala/utils.py +184 -0
  66. sota-implementations/iql/discrete_iql.py +230 -0
  67. sota-implementations/iql/iql_offline.py +164 -0
  68. sota-implementations/iql/iql_online.py +225 -0
  69. sota-implementations/iql/utils.py +437 -0
  70. sota-implementations/multiagent/README.md +74 -0
  71. sota-implementations/multiagent/iql.py +237 -0
  72. sota-implementations/multiagent/maddpg_iddpg.py +266 -0
  73. sota-implementations/multiagent/mappo_ippo.py +267 -0
  74. sota-implementations/multiagent/qmix_vdn.py +271 -0
  75. sota-implementations/multiagent/sac.py +337 -0
  76. sota-implementations/multiagent/utils/__init__.py +4 -0
  77. sota-implementations/multiagent/utils/logging.py +151 -0
  78. sota-implementations/multiagent/utils/utils.py +43 -0
  79. sota-implementations/ppo/README.md +29 -0
  80. sota-implementations/ppo/ppo_atari.py +305 -0
  81. sota-implementations/ppo/ppo_mujoco.py +293 -0
  82. sota-implementations/ppo/utils_atari.py +238 -0
  83. sota-implementations/ppo/utils_mujoco.py +152 -0
  84. sota-implementations/ppo_trainer/train.py +21 -0
  85. sota-implementations/redq/README.md +7 -0
  86. sota-implementations/redq/redq.py +199 -0
  87. sota-implementations/redq/utils.py +1060 -0
  88. sota-implementations/sac/sac-async.py +266 -0
  89. sota-implementations/sac/sac.py +239 -0
  90. sota-implementations/sac/utils.py +381 -0
  91. sota-implementations/sac_trainer/train.py +16 -0
  92. sota-implementations/td3/td3.py +254 -0
  93. sota-implementations/td3/utils.py +319 -0
  94. sota-implementations/td3_bc/td3_bc.py +177 -0
  95. sota-implementations/td3_bc/utils.py +251 -0
  96. torchrl/__init__.py +144 -0
  97. torchrl/_extension.py +74 -0
  98. torchrl/_torchrl.cp314-win_amd64.pyd +0 -0
  99. torchrl/_utils.py +1431 -0
  100. torchrl/collectors/__init__.py +48 -0
  101. torchrl/collectors/_base.py +1058 -0
  102. torchrl/collectors/_constants.py +88 -0
  103. torchrl/collectors/_multi_async.py +324 -0
  104. torchrl/collectors/_multi_base.py +1805 -0
  105. torchrl/collectors/_multi_sync.py +464 -0
  106. torchrl/collectors/_runner.py +581 -0
  107. torchrl/collectors/_single.py +2009 -0
  108. torchrl/collectors/_single_async.py +259 -0
  109. torchrl/collectors/collectors.py +62 -0
  110. torchrl/collectors/distributed/__init__.py +32 -0
  111. torchrl/collectors/distributed/default_configs.py +133 -0
  112. torchrl/collectors/distributed/generic.py +1306 -0
  113. torchrl/collectors/distributed/ray.py +1092 -0
  114. torchrl/collectors/distributed/rpc.py +1006 -0
  115. torchrl/collectors/distributed/sync.py +731 -0
  116. torchrl/collectors/distributed/utils.py +160 -0
  117. torchrl/collectors/llm/__init__.py +10 -0
  118. torchrl/collectors/llm/base.py +494 -0
  119. torchrl/collectors/llm/ray_collector.py +275 -0
  120. torchrl/collectors/llm/utils.py +36 -0
  121. torchrl/collectors/llm/weight_update/__init__.py +10 -0
  122. torchrl/collectors/llm/weight_update/vllm.py +348 -0
  123. torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
  124. torchrl/collectors/utils.py +433 -0
  125. torchrl/collectors/weight_update.py +591 -0
  126. torchrl/csrc/numpy_utils.h +38 -0
  127. torchrl/csrc/pybind.cpp +27 -0
  128. torchrl/csrc/segment_tree.h +458 -0
  129. torchrl/csrc/torch_utils.h +34 -0
  130. torchrl/csrc/utils.cpp +48 -0
  131. torchrl/csrc/utils.h +31 -0
  132. torchrl/data/__init__.py +187 -0
  133. torchrl/data/datasets/__init__.py +58 -0
  134. torchrl/data/datasets/atari_dqn.py +878 -0
  135. torchrl/data/datasets/common.py +281 -0
  136. torchrl/data/datasets/d4rl.py +489 -0
  137. torchrl/data/datasets/d4rl_infos.py +187 -0
  138. torchrl/data/datasets/gen_dgrl.py +375 -0
  139. torchrl/data/datasets/minari_data.py +643 -0
  140. torchrl/data/datasets/openml.py +177 -0
  141. torchrl/data/datasets/openx.py +798 -0
  142. torchrl/data/datasets/roboset.py +363 -0
  143. torchrl/data/datasets/utils.py +11 -0
  144. torchrl/data/datasets/vd4rl.py +432 -0
  145. torchrl/data/llm/__init__.py +34 -0
  146. torchrl/data/llm/dataset.py +491 -0
  147. torchrl/data/llm/history.py +1378 -0
  148. torchrl/data/llm/prompt.py +198 -0
  149. torchrl/data/llm/reward.py +225 -0
  150. torchrl/data/llm/topk.py +186 -0
  151. torchrl/data/llm/utils.py +543 -0
  152. torchrl/data/map/__init__.py +21 -0
  153. torchrl/data/map/hash.py +185 -0
  154. torchrl/data/map/query.py +204 -0
  155. torchrl/data/map/tdstorage.py +363 -0
  156. torchrl/data/map/tree.py +1434 -0
  157. torchrl/data/map/utils.py +103 -0
  158. torchrl/data/postprocs/__init__.py +8 -0
  159. torchrl/data/postprocs/postprocs.py +391 -0
  160. torchrl/data/replay_buffers/__init__.py +99 -0
  161. torchrl/data/replay_buffers/checkpointers.py +622 -0
  162. torchrl/data/replay_buffers/ray_buffer.py +292 -0
  163. torchrl/data/replay_buffers/replay_buffers.py +2376 -0
  164. torchrl/data/replay_buffers/samplers.py +2578 -0
  165. torchrl/data/replay_buffers/scheduler.py +265 -0
  166. torchrl/data/replay_buffers/storages.py +2412 -0
  167. torchrl/data/replay_buffers/utils.py +1042 -0
  168. torchrl/data/replay_buffers/writers.py +781 -0
  169. torchrl/data/tensor_specs.py +7101 -0
  170. torchrl/data/utils.py +334 -0
  171. torchrl/envs/__init__.py +265 -0
  172. torchrl/envs/async_envs.py +1105 -0
  173. torchrl/envs/batched_envs.py +3093 -0
  174. torchrl/envs/common.py +4241 -0
  175. torchrl/envs/custom/__init__.py +11 -0
  176. torchrl/envs/custom/chess.py +617 -0
  177. torchrl/envs/custom/llm.py +214 -0
  178. torchrl/envs/custom/pendulum.py +401 -0
  179. torchrl/envs/custom/san_moves.txt +29274 -0
  180. torchrl/envs/custom/tictactoeenv.py +288 -0
  181. torchrl/envs/env_creator.py +263 -0
  182. torchrl/envs/gym_like.py +752 -0
  183. torchrl/envs/libs/__init__.py +68 -0
  184. torchrl/envs/libs/_gym_utils.py +326 -0
  185. torchrl/envs/libs/brax.py +846 -0
  186. torchrl/envs/libs/dm_control.py +544 -0
  187. torchrl/envs/libs/envpool.py +447 -0
  188. torchrl/envs/libs/gym.py +2239 -0
  189. torchrl/envs/libs/habitat.py +138 -0
  190. torchrl/envs/libs/isaac_lab.py +87 -0
  191. torchrl/envs/libs/isaacgym.py +203 -0
  192. torchrl/envs/libs/jax_utils.py +166 -0
  193. torchrl/envs/libs/jumanji.py +963 -0
  194. torchrl/envs/libs/meltingpot.py +599 -0
  195. torchrl/envs/libs/openml.py +153 -0
  196. torchrl/envs/libs/openspiel.py +652 -0
  197. torchrl/envs/libs/pettingzoo.py +1042 -0
  198. torchrl/envs/libs/procgen.py +351 -0
  199. torchrl/envs/libs/robohive.py +429 -0
  200. torchrl/envs/libs/smacv2.py +645 -0
  201. torchrl/envs/libs/unity_mlagents.py +891 -0
  202. torchrl/envs/libs/utils.py +147 -0
  203. torchrl/envs/libs/vmas.py +813 -0
  204. torchrl/envs/llm/__init__.py +63 -0
  205. torchrl/envs/llm/chat.py +730 -0
  206. torchrl/envs/llm/datasets/README.md +4 -0
  207. torchrl/envs/llm/datasets/__init__.py +17 -0
  208. torchrl/envs/llm/datasets/gsm8k.py +353 -0
  209. torchrl/envs/llm/datasets/ifeval.py +274 -0
  210. torchrl/envs/llm/envs.py +789 -0
  211. torchrl/envs/llm/libs/README.md +3 -0
  212. torchrl/envs/llm/libs/__init__.py +8 -0
  213. torchrl/envs/llm/libs/mlgym.py +869 -0
  214. torchrl/envs/llm/reward/__init__.py +10 -0
  215. torchrl/envs/llm/reward/gsm8k.py +324 -0
  216. torchrl/envs/llm/reward/ifeval/README.md +13 -0
  217. torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
  218. torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
  219. torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
  220. torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
  221. torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
  222. torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
  223. torchrl/envs/llm/transforms/__init__.py +55 -0
  224. torchrl/envs/llm/transforms/browser.py +292 -0
  225. torchrl/envs/llm/transforms/dataloading.py +859 -0
  226. torchrl/envs/llm/transforms/format.py +73 -0
  227. torchrl/envs/llm/transforms/kl.py +1544 -0
  228. torchrl/envs/llm/transforms/policy_version.py +189 -0
  229. torchrl/envs/llm/transforms/reason.py +323 -0
  230. torchrl/envs/llm/transforms/tokenizer.py +321 -0
  231. torchrl/envs/llm/transforms/tools.py +1955 -0
  232. torchrl/envs/model_based/__init__.py +9 -0
  233. torchrl/envs/model_based/common.py +180 -0
  234. torchrl/envs/model_based/dreamer.py +112 -0
  235. torchrl/envs/transforms/__init__.py +147 -0
  236. torchrl/envs/transforms/functional.py +48 -0
  237. torchrl/envs/transforms/gym_transforms.py +203 -0
  238. torchrl/envs/transforms/module.py +341 -0
  239. torchrl/envs/transforms/r3m.py +372 -0
  240. torchrl/envs/transforms/ray_service.py +663 -0
  241. torchrl/envs/transforms/rb_transforms.py +214 -0
  242. torchrl/envs/transforms/transforms.py +11835 -0
  243. torchrl/envs/transforms/utils.py +94 -0
  244. torchrl/envs/transforms/vc1.py +307 -0
  245. torchrl/envs/transforms/vecnorm.py +845 -0
  246. torchrl/envs/transforms/vip.py +407 -0
  247. torchrl/envs/utils.py +1718 -0
  248. torchrl/envs/vec_envs.py +11 -0
  249. torchrl/modules/__init__.py +206 -0
  250. torchrl/modules/distributions/__init__.py +73 -0
  251. torchrl/modules/distributions/continuous.py +830 -0
  252. torchrl/modules/distributions/discrete.py +908 -0
  253. torchrl/modules/distributions/truncated_normal.py +187 -0
  254. torchrl/modules/distributions/utils.py +233 -0
  255. torchrl/modules/llm/__init__.py +62 -0
  256. torchrl/modules/llm/backends/__init__.py +65 -0
  257. torchrl/modules/llm/backends/vllm/__init__.py +94 -0
  258. torchrl/modules/llm/backends/vllm/_models.py +46 -0
  259. torchrl/modules/llm/backends/vllm/base.py +72 -0
  260. torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
  261. torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
  262. torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
  263. torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
  264. torchrl/modules/llm/policies/__init__.py +28 -0
  265. torchrl/modules/llm/policies/common.py +1809 -0
  266. torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
  267. torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
  268. torchrl/modules/llm/utils.py +23 -0
  269. torchrl/modules/mcts/__init__.py +21 -0
  270. torchrl/modules/mcts/scores.py +579 -0
  271. torchrl/modules/models/__init__.py +86 -0
  272. torchrl/modules/models/batchrenorm.py +119 -0
  273. torchrl/modules/models/decision_transformer.py +179 -0
  274. torchrl/modules/models/exploration.py +731 -0
  275. torchrl/modules/models/llm.py +156 -0
  276. torchrl/modules/models/model_based.py +596 -0
  277. torchrl/modules/models/models.py +1712 -0
  278. torchrl/modules/models/multiagent.py +1067 -0
  279. torchrl/modules/models/recipes/impala.py +185 -0
  280. torchrl/modules/models/utils.py +162 -0
  281. torchrl/modules/planners/__init__.py +10 -0
  282. torchrl/modules/planners/cem.py +228 -0
  283. torchrl/modules/planners/common.py +73 -0
  284. torchrl/modules/planners/mppi.py +265 -0
  285. torchrl/modules/tensordict_module/__init__.py +89 -0
  286. torchrl/modules/tensordict_module/actors.py +2457 -0
  287. torchrl/modules/tensordict_module/common.py +529 -0
  288. torchrl/modules/tensordict_module/exploration.py +814 -0
  289. torchrl/modules/tensordict_module/probabilistic.py +321 -0
  290. torchrl/modules/tensordict_module/rnn.py +1639 -0
  291. torchrl/modules/tensordict_module/sequence.py +132 -0
  292. torchrl/modules/tensordict_module/world_models.py +34 -0
  293. torchrl/modules/utils/__init__.py +38 -0
  294. torchrl/modules/utils/mappings.py +9 -0
  295. torchrl/modules/utils/utils.py +89 -0
  296. torchrl/objectives/__init__.py +78 -0
  297. torchrl/objectives/a2c.py +659 -0
  298. torchrl/objectives/common.py +753 -0
  299. torchrl/objectives/cql.py +1346 -0
  300. torchrl/objectives/crossq.py +710 -0
  301. torchrl/objectives/ddpg.py +453 -0
  302. torchrl/objectives/decision_transformer.py +371 -0
  303. torchrl/objectives/deprecated.py +516 -0
  304. torchrl/objectives/dqn.py +683 -0
  305. torchrl/objectives/dreamer.py +488 -0
  306. torchrl/objectives/functional.py +48 -0
  307. torchrl/objectives/gail.py +258 -0
  308. torchrl/objectives/iql.py +996 -0
  309. torchrl/objectives/llm/__init__.py +30 -0
  310. torchrl/objectives/llm/grpo.py +846 -0
  311. torchrl/objectives/llm/sft.py +482 -0
  312. torchrl/objectives/multiagent/__init__.py +8 -0
  313. torchrl/objectives/multiagent/qmixer.py +396 -0
  314. torchrl/objectives/ppo.py +1669 -0
  315. torchrl/objectives/redq.py +683 -0
  316. torchrl/objectives/reinforce.py +530 -0
  317. torchrl/objectives/sac.py +1580 -0
  318. torchrl/objectives/td3.py +570 -0
  319. torchrl/objectives/td3_bc.py +625 -0
  320. torchrl/objectives/utils.py +782 -0
  321. torchrl/objectives/value/__init__.py +28 -0
  322. torchrl/objectives/value/advantages.py +1956 -0
  323. torchrl/objectives/value/functional.py +1459 -0
  324. torchrl/objectives/value/utils.py +360 -0
  325. torchrl/record/__init__.py +17 -0
  326. torchrl/record/loggers/__init__.py +23 -0
  327. torchrl/record/loggers/common.py +48 -0
  328. torchrl/record/loggers/csv.py +226 -0
  329. torchrl/record/loggers/mlflow.py +142 -0
  330. torchrl/record/loggers/tensorboard.py +139 -0
  331. torchrl/record/loggers/trackio.py +163 -0
  332. torchrl/record/loggers/utils.py +78 -0
  333. torchrl/record/loggers/wandb.py +214 -0
  334. torchrl/record/recorder.py +554 -0
  335. torchrl/services/__init__.py +79 -0
  336. torchrl/services/base.py +109 -0
  337. torchrl/services/ray_service.py +453 -0
  338. torchrl/testing/__init__.py +107 -0
  339. torchrl/testing/assertions.py +179 -0
  340. torchrl/testing/dist_utils.py +122 -0
  341. torchrl/testing/env_creators.py +227 -0
  342. torchrl/testing/env_helper.py +35 -0
  343. torchrl/testing/gym_helpers.py +156 -0
  344. torchrl/testing/llm_mocks.py +119 -0
  345. torchrl/testing/mocking_classes.py +2720 -0
  346. torchrl/testing/modules.py +295 -0
  347. torchrl/testing/mp_helpers.py +15 -0
  348. torchrl/testing/ray_helpers.py +293 -0
  349. torchrl/testing/utils.py +190 -0
  350. torchrl/trainers/__init__.py +42 -0
  351. torchrl/trainers/algorithms/__init__.py +11 -0
  352. torchrl/trainers/algorithms/configs/__init__.py +705 -0
  353. torchrl/trainers/algorithms/configs/collectors.py +216 -0
  354. torchrl/trainers/algorithms/configs/common.py +41 -0
  355. torchrl/trainers/algorithms/configs/data.py +308 -0
  356. torchrl/trainers/algorithms/configs/envs.py +104 -0
  357. torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
  358. torchrl/trainers/algorithms/configs/logging.py +80 -0
  359. torchrl/trainers/algorithms/configs/modules.py +570 -0
  360. torchrl/trainers/algorithms/configs/objectives.py +177 -0
  361. torchrl/trainers/algorithms/configs/trainers.py +340 -0
  362. torchrl/trainers/algorithms/configs/transforms.py +955 -0
  363. torchrl/trainers/algorithms/configs/utils.py +252 -0
  364. torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
  365. torchrl/trainers/algorithms/configs/weight_update.py +159 -0
  366. torchrl/trainers/algorithms/ppo.py +373 -0
  367. torchrl/trainers/algorithms/sac.py +308 -0
  368. torchrl/trainers/helpers/__init__.py +40 -0
  369. torchrl/trainers/helpers/collectors.py +416 -0
  370. torchrl/trainers/helpers/envs.py +573 -0
  371. torchrl/trainers/helpers/logger.py +33 -0
  372. torchrl/trainers/helpers/losses.py +132 -0
  373. torchrl/trainers/helpers/models.py +658 -0
  374. torchrl/trainers/helpers/replay_buffer.py +59 -0
  375. torchrl/trainers/helpers/trainers.py +301 -0
  376. torchrl/trainers/trainers.py +2052 -0
  377. torchrl/weight_update/__init__.py +33 -0
  378. torchrl/weight_update/_distributed.py +749 -0
  379. torchrl/weight_update/_mp.py +624 -0
  380. torchrl/weight_update/_noupdate.py +102 -0
  381. torchrl/weight_update/_ray.py +1032 -0
  382. torchrl/weight_update/_rpc.py +284 -0
  383. torchrl/weight_update/_shared.py +891 -0
  384. torchrl/weight_update/llm/__init__.py +32 -0
  385. torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
  386. torchrl/weight_update/llm/vllm_nccl.py +710 -0
  387. torchrl/weight_update/utils.py +73 -0
  388. torchrl/weight_update/weight_sync_schemes.py +1244 -0
  389. torchrl-0.11.0.dist-info/LICENSE +21 -0
  390. torchrl-0.11.0.dist-info/METADATA +1307 -0
  391. torchrl-0.11.0.dist-info/RECORD +394 -0
  392. torchrl-0.11.0.dist-info/WHEEL +5 -0
  393. torchrl-0.11.0.dist-info/entry_points.txt +2 -0
  394. torchrl-0.11.0.dist-info/top_level.txt +7 -0
sota-implementations/expert-iteration/ei_utils.py
@@ -0,0 +1,770 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ from __future__ import annotations
+
+ import time
+ from typing import Any, Literal
+
+ import torch
+ from omegaconf import DictConfig
+
+ from tensordict import TensorDict
+ from torch import device as torch_device, dtype as torch_dtype
+
+ from torchrl._utils import logger as torchrl_logger
+ from torchrl.envs.llm import RetrieveLogProb
+ from torchrl.envs.llm.datasets.ifeval import IFEvalEnv
+ from torchrl.modules.llm import TransformersWrapper, vLLMWrapper
+ from torchrl.weight_update.llm import VLLMWeightSyncScheme
+ from transformers.models.auto.modeling_auto import AutoModelForCausalLM
+ from transformers.tokenization_utils import PreTrainedTokenizer
+
+ try:
+     import ray
+ except ImportError:
+     ray = None
+
+
+ def get_tokenizer(cfg: DictConfig) -> PreTrainedTokenizer:
+     """Return the tokenizer for ``cfg.model.name``, left-padded, with a pad token distinct from EOS."""
+     from transformers import AutoTokenizer
+
+     model_name = cfg.model.name
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     # tokenizer.eos_token = "<|im_end|>"
+     if tokenizer.pad_token == tokenizer.eos_token:
+         tokenizer.pad_token = "PAD"
+     tokenizer.padding_side = "left"
+     return tokenizer
+
+
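+ # Example (illustrative sketch, not part of the module; assumes a hydra-style
+ # config whose ``model.name`` points at a checkpoint on the Hub):
+ #     >>> cfg = DictConfig({"model": {"name": "Qwen/Qwen2.5-3B"}})
+ #     >>> tok = get_tokenizer(cfg)
+ #     >>> tok.padding_side
+ #     'left'
+
+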
+ def make_env(cfg: DictConfig, devices: list[int] | None = None):
+     """Create the environment with proper device allocation.
+
+     Args:
+         cfg: The configuration object.
+         devices: The devices to use for the reference model.
+
+     Returns:
+         The configured environment.
+     """
+     # Create the reference model with proper device allocation.
+     # For the collector actor, we want inference_model devices first, then ref_model devices.
+     train_tokenizer = get_tokenizer(cfg)
+
+     # Create a new config with adjusted device assignments
+     ref_cfg = DictConfig(dict(cfg))
+     ref_model = get_ref_model(ref_cfg, train_tokenizer, devices=devices)
+
+     # Setup environment
+     if cfg.env.dataset == "gsm8k":
+         from torchrl.envs.llm import GSM8KEnv
+
+         env = GSM8KEnv(
+             repeats=cfg.env.repeats,
+             tokenizer=train_tokenizer,
+             num_envs=cfg.env.num_envs,
+             device=torch.device("cpu"),
+         )
+     else:  # ifeval
+         env = IFEvalEnv(
+             repeats=cfg.env.repeats,
+             tokenizer=train_tokenizer,
+             num_envs=cfg.env.num_envs,
+             device=torch.device("cpu"),
+         )
+
+     # Pass the device directly to RetrieveLogProb. Under Ray, the local device
+     # index is always 0, so "cuda:0" is correct here.
+     device = torch.device("cuda:0")
+     env = env.append_transform(
+         RetrieveLogProb(
+             model=ref_model,
+             assistant_only=True,
+             tokenizer_kwargs={"chat_template_name": "qwen"},
+             device=device,
+             log_probs_full_key=("ref_log_probs", "full"),
+         )
+     )
+     return env
+
+
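+ # Example (illustrative sketch; assumes ``cfg.env`` carries ``dataset``,
+ # ``repeats`` and ``num_envs``, and that the ``ref_model`` section consumed by
+ # get_ref_model is populated):
+ #     >>> env = make_env(cfg, devices=[1])  # reference model on cuda:1
+ #     >>> td = env.reset()                  # batched reset over cfg.env.num_envs
+
+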
+ def get_train_model(
+     cfg: DictConfig,
+     devices: list[int] | None = None,
+     chat_template_name: str | None = None,
+ ) -> tuple[TransformersWrapper, PreTrainedTokenizer]:
+     """Creates and configures the training model with LoRA adapters.
+
+     This function initializes the main training model with LoRA adapters and other
+     training-specific configurations such as gradient checkpointing. The model is
+     wrapped in a TransformersWrapper for policy training.
+
+     Args:
+         cfg (DictConfig): The hydra configuration object containing model and training
+             settings. Expected to have a train_model section with LoRA, quantization,
+             and other training-specific parameters.
+         devices (list[int] | None, optional): The devices to use for the training model.
+             Defaults to ``None``.
+         chat_template_name (str | None, optional): The name of the chat template to use.
+             Defaults to ``None``.
+
+     Returns:
+         tuple[TransformersWrapper, PreTrainedTokenizer]:
+             - policy_training: The wrapped training model
+             - train_tokenizer: The tokenizer for the model
+
+     Raises:
+         RuntimeError: If CUDA is not available or if device allocation fails.
+     """
+     torchrl_logger.info("Creating train model")
+
+     # Set model dtype explicitly
+     model_dtype = getattr(torch, cfg.train_model.torch_dtype)
+
+     # Get configured devices or default to [0]
+     train_devices = devices if devices is not None else [0]
+
+     # Create max_memory dict - set 0 memory for GPUs we don't want to use
+     max_memory = {}
+     for i in range(torch.cuda.device_count()):
+         if i in train_devices:
+             max_memory[i] = "24GiB"  # Allow max memory for devices we want to use
+         else:
+             max_memory[i] = "0GiB"  # No memory for other devices
+     max_memory["cpu"] = "24GiB"  # Allow CPU memory as fallback
+
+     # Let HF handle distribution with max_memory
+     device_map = "balanced" if len(train_devices) > 1 else f"cuda:{train_devices[0]}"
+     train_model, train_tokenizer = get_hf_model(
+         cfg.model.name,
+         device_map=device_map,
+         max_memory=max_memory,
+         lora=cfg.train_model.lora.enabled,
+         lora_r=cfg.train_model.lora.r,
+         lora_alpha=cfg.train_model.lora.alpha,
+         lora_dropout=cfg.train_model.lora.dropout,
+         gradient_checkpointing=cfg.train_model.gradient_checkpointing,
+         quantize=cfg.train_model.quantization.enabled,
+         torch_dtype=model_dtype,
+         attn_implementation=cfg.train_model.attn_implementation,
+         compile=cfg.model.compile,
+         eval_mode=cfg.train_model.eval,
+     )
+
+     # Force all model parameters to the same dtype
+     for param in train_model.parameters():
+         param.data = param.data.to(model_dtype)
+
+     policy_training = TransformersWrapper(
+         train_model,
+         tokenizer=train_tokenizer,
+         input_mode="history",
+         generate=False,
+         return_log_probs=True,
+         pad_output=False,
+         device=torch.device("cuda:0"),
+     )
+     # Ensure the model stays in eval mode after wrapping (eval() is train(False))
+     policy_training.model.eval()
+     return policy_training, train_tokenizer
+
+
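+ # Example (illustrative sketch; ``cfg`` is assumed to carry the ``train_model``
+ # section described above, and ``data`` a history-formatted TensorDict batch):
+ #     >>> policy_training, tok = get_train_model(cfg, devices=[0])
+ #     >>> out = policy_training(data)  # log-probs only, since generate=False
+
+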
+ def get_inference_model(
+     cfg: DictConfig,
+     devices: list[int] | None = None,
+     make_ray_worker: bool = True,
+     tokenizer: PreTrainedTokenizer | None = None,
+ ) -> vLLMWrapper:
+     """Creates the vLLM-based inference model for fast generation.
+
+     This function initializes a vLLM model server for efficient inference and wraps
+     it in a vLLMWrapper for policy inference. vLLM provides optimized generation
+     with better throughput than standard HuggingFace generation.
+
+     Args:
+         cfg (DictConfig): The hydra configuration object containing model settings.
+             Expected to have an inference_model section with vLLM-specific parameters
+             such as gpu_memory_utilization and generation settings.
+         devices (list[int] | None, optional): The devices to use for the inference
+             model. Default: ``None``.
+         make_ray_worker (bool, optional): Whether to make a ray worker. Default: ``True``.
+         tokenizer (PreTrainedTokenizer | None, optional): The tokenizer to use.
+             Default: ``None``.
+
+     Returns:
+         vLLMWrapper: The wrapped vLLM model ready for inference.
+
+     Raises:
+         AssertionError: If the vLLM server or model initialization fails.
+     """
+     from torchrl.modules.llm.backends import make_vllm_worker
+
+     num_devices = cfg.inference_model.num_devices
+     if num_devices is None:
+         vllm_devices = devices if devices is not None else [1]
+     else:
+         vllm_devices = None
+     torchrl_logger.info(
+         f"Creating inference model with num_devices={num_devices}, devices={vllm_devices}"
+     )
+
+     model_name = cfg.model.name
+
+     if tokenizer is None:
+         tokenizer = get_tokenizer(cfg)
+
+     # vLLM handles device mapping internally
+     inference_server = make_vllm_worker(
+         model_name=model_name,
+         gpu_memory_utilization=cfg.inference_model.gpu_memory_utilization,
+         num_devices=num_devices,
+         devices=list(vllm_devices)
+         if vllm_devices is not None
+         else None,  # Convert to list for type compatibility
+         make_ray_worker=make_ray_worker,
+         enforce_eager=cfg.inference_model.enforce_eager,
+     )
+     assert inference_server is not None
+     policy = vLLMWrapper(
+         inference_server,
+         input_mode="history",
+         chat_template_name="qwen",
+         return_log_probs=True,
+         tokenizer=tokenizer,
+         pad_output=False,
+         generate_kwargs={
+             "max_tokens": cfg.inference_model.max_tokens,
+             "include_stop_str_in_output": cfg.inference_model.include_stop_str_in_output,
+             "temperature": cfg.inference_model.temperature,
+         },
+     )
+     assert policy.model is not None
+     return policy
+
+
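+ # Example (illustrative sketch; requires vLLM, and a running Ray cluster when
+ # ``make_ray_worker=True``):
+ #     >>> policy = get_inference_model(cfg, make_ray_worker=False)
+ #     >>> rollout = policy(env.reset())  # generation handled by the vLLM engine
+
+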
+ def get_ref_model(
+     cfg: DictConfig, tokenizer: PreTrainedTokenizer, devices: list[int] | None = None
+ ) -> TransformersWrapper:
+     """Creates the reference model for KL penalty computation.
+
+     This function initializes a frozen copy of the base model to serve as the
+     reference model for KL divergence computation. The reference model is typically
+     quantized and does not require gradient computation.
+
+     Args:
+         cfg (DictConfig): The hydra configuration object containing model settings.
+             Expected to have a ref_model section with quantization and attention settings.
+         tokenizer (PreTrainedTokenizer): The tokenizer to use with the reference model.
+         devices (list[int] | None, optional): The devices to use for the reference
+             model. Default: ``None``.
+
+     Returns:
+         TransformersWrapper: The wrapped reference model in eval mode with detached weights.
+     """
+     torchrl_logger.info("Creating ref model")
+
+     # Get configured devices or default to [2]
+     if cfg.ref_model.num_devices is None:
+         ref_devices = devices if devices is not None else [2]
+     else:
+         ref_devices = list(range(cfg.ref_model.num_devices))
+
+     # Create max_memory dict - set 0 memory for GPUs we don't want to use
+     max_memory = {}
+     for i in range(torch.cuda.device_count()):
+         if i in ref_devices:
+             max_memory[i] = "24GiB"  # Allow max memory for devices we want to use
+         else:
+             max_memory[i] = "0GiB"  # No memory for other devices
+     max_memory["cpu"] = "24GiB"  # Allow CPU memory as fallback
+
+     # Let HF handle distribution with max_memory
+     device_map = "balanced" if len(ref_devices) > 1 else f"cuda:{ref_devices[0]}"
+     model_name = cfg.model.name
+
+     ref_model = get_hf_model(
+         model_name,
+         device_map=device_map,
+         max_memory=max_memory,
+         torch_dtype=getattr(torch, cfg.ref_model.torch_dtype),
+         quantize=cfg.ref_model.quantization.enabled,
+         gradient_checkpointing=cfg.ref_model.gradient_checkpointing,
+         attn_implementation=cfg.ref_model.attn_implementation,
+         lora=False,  # Reference model doesn't need LoRA
+         requires_grad=False,
+         eval_mode=True,
+         lora_dropout=0.0,
+     )[0]
+     # Detach the weights by replacing module parameters with plain tensors
+     TensorDict.from_module(ref_model).data.to_module(ref_model)
+     ref_model = TransformersWrapper(
+         ref_model,
+         tokenizer=tokenizer,
+         input_mode="history",
+         generate=False,
+         return_log_probs=True,
+         pad_output=False,
+         device=torch.device("cuda:0"),
+         chat_template_name="qwen",
+     )
+     return ref_model
+
+
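+ # Note on the detach idiom in get_ref_model above:
+ # ``TensorDict.from_module(m).data.to_module(m)`` reads the parameters of ``m``,
+ # takes their gradient-free ``.data`` view, and writes the plain tensors back,
+ # so the reference model carries no autograd state. Minimal sketch on a toy module:
+ #     >>> import torch.nn as nn
+ #     >>> m = nn.Linear(2, 2)
+ #     >>> TensorDict.from_module(m).data.to_module(m)  # weights no longer track grads
+
+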
+ def get_hf_model(
+     model_name: str,
+     torch_dtype: torch_dtype = torch.float32,
+     lora_r: int = 8,
+     lora_alpha: int = 16,
+     lora_dropout: float = 0.1,
+     quantize: bool = False,
+     fsdp: str = "",
+     fsdp_config: Any = None,
+     gradient_checkpointing: bool = True,
+     device_map: str
+     | dict[str, int | str | torch_device]
+     | int
+     | torch_device
+     | None = None,
+     lora: bool = True,
+     attn_implementation: Literal["flash_attention_2", "flex_attention", "sdpa"]
+     | None = "flex_attention",
+     requires_grad: bool = True,
+     compile: bool = False,
+     max_memory: dict[str, str] | None = None,
+     eval_mode: bool = False,
+ ) -> tuple[AutoModelForCausalLM, PreTrainedTokenizer]:
+     """Creates and configures a HuggingFace model with optional optimizations.
+
+     Args:
+         model_name (str): HuggingFace model identifier (e.g., "Qwen/Qwen2.5-3B").
+         torch_dtype (torch.dtype, optional): Model precision. Default: torch.float32
+         lora_r (int, optional): LoRA rank - controls capacity of adaptations. Default: 8
+         lora_alpha (int, optional): LoRA alpha - scales the adaptations. Default: 16
+         lora_dropout (float, optional): Dropout probability for LoRA layers. Default: 0.1
+         quantize (bool, optional): Whether to enable 4-bit quantization. Default: False
+         fsdp (str, optional): Fully Sharded Data Parallel configuration. Default: ""
+         fsdp_config (Any, optional): Additional FSDP configurations. Default: None
+         gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Default: True
+         device_map (str | dict | int | torch.device | None, optional): Device placement strategy. Default: None
+         lora (bool, optional): Whether to apply LoRA adapters. Default: True
+         attn_implementation (Literal["flash_attention_2", "flex_attention", "sdpa"] | None, optional):
+             Attention implementation to use. Default: "flex_attention"
+         requires_grad (bool, optional): Whether to enable gradient computation. Default: True
+         compile (bool, optional): Whether to enable model compilation. Default: False
+         max_memory (dict[str, str] | None, optional): Memory configuration for distributed
+             training. Default: None (treated as an empty dict)
+         eval_mode (bool, optional): Whether to use the model in eval mode. Default: False
+
+     Returns:
+         tuple[AutoModelForCausalLM, PreTrainedTokenizer]:
+             - model: The configured HuggingFace model
+             - tokenizer: The associated tokenizer
+
+     Raises:
+         ImportError: If required dependencies are not installed.
+         RuntimeError: If model initialization fails.
+     """
+     from transformers import AutoModelForCausalLM, AutoTokenizer
+
+     if max_memory is None:
+         max_memory = {}
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     # tokenizer.eos_token = "<|im_end|>"
+     if tokenizer.pad_token == tokenizer.eos_token:
+         tokenizer.pad_token = "PAD"
+     tokenizer.padding_side = "left"
+
+     # Configure model settings for mixed precision.
+     # Store the original default dtype to restore it later.
+     original_dtype = torch.get_default_dtype()
+     torch.set_default_dtype(torch_dtype)
+
+     model_configs = {
+         "torch_dtype": torch_dtype,
+         "device_map": device_map if device_map is not None else "auto",
+         "max_memory": max_memory,
+     }
+     if torch.cuda.is_available() and attn_implementation:
+         torchrl_logger.info(f"{attn_implementation} init")
+         model_configs["attn_implementation"] = attn_implementation
+
+     try:
+         # Configure training settings based on FSDP usage
+         if fsdp != "" and fsdp_config is not None:
+             torchrl_logger.info("Configurations for FSDP")
+
+         # Enable quantization
+         if quantize:
+             try:
+                 from transformers.utils.quantization_config import BitsAndBytesConfig
+             except ImportError:
+                 raise ImportError(
+                     "Please install transformers with bitsandbytes support"
+                 )
+
+             bnb_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_use_double_quant=True,
+                 bnb_4bit_quant_type="nf4",
+                 bnb_4bit_compute_dtype=torch_dtype,
+             )
+             model_configs["quantization_config"] = bnb_config
+
+         model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             trust_remote_code=True,
+             use_cache=not gradient_checkpointing,
+             cache_dir="/tmp/.cache",
+             **model_configs,
+         )
+
+         # Configure gradient checkpointing based on FSDP usage
+         if fsdp == "" and fsdp_config is None:
+             if gradient_checkpointing:
+                 torchrl_logger.info("gradient_checkpointing enabled")
+                 model.gradient_checkpointing_enable()
+         else:
+             if gradient_checkpointing:
+                 torchrl_logger.info("gradient_checkpointing enabled")
+                 model.gradient_checkpointing_enable(
+                     gradient_checkpointing_kwargs={"use_reentrant": False}
+                 )
+
+         if lora:
+             try:
+                 from peft import get_peft_model, LoraConfig
+             except ImportError:
+                 raise ImportError("Please install peft: pip install peft")
+
+             # Create LoRA config with explicit dtype setting
+             lora_config = LoraConfig(
+                 r=lora_r,
+                 lora_alpha=lora_alpha,
+                 target_modules="all-linear",
+                 lora_dropout=lora_dropout,  # Standard dropout for regularization
+                 bias="none",
+                 task_type="CAUSAL_LM",
+                 inference_mode=eval_mode,  # CRITICAL: must be False for training
+                 init_lora_weights=True,  # Good practice
+             )
+
+             # Initialize LoRA model
+             model = get_peft_model(
+                 model,
+                 lora_config,
+                 autocast_adapter_dtype=False,  # Prevent automatic casting of adapter layers
+             )
+
+             # Force LoRA layers to the correct dtype
+             for n, p in model.named_parameters():
+                 if "lora_" in n:  # Only convert LoRA parameters
+                     p.data = p.data.to(torch_dtype)
+         if eval_mode:
+             model.eval()  # Ensure model is in eval mode
+         else:
+             model.train(True)
+         if requires_grad:
+             model.requires_grad_(True)
+         else:
+             model.requires_grad_(False)
+         return model, tokenizer
+
+     finally:
+         # Restore the original default dtype
+         torch.set_default_dtype(original_dtype)
+
+
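+ # Example (illustrative sketch; downloads the checkpoint on first use):
+ #     >>> model, tok = get_hf_model(
+ #     ...     "Qwen/Qwen2.5-3B",
+ #     ...     torch_dtype=torch.bfloat16,
+ #     ...     lora=True,
+ #     ...     device_map="cuda:0",
+ #     ... )
+
+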
+ def make_weight_sync_scheme(
+     master_address=None,
+     master_port=None,
+     vllm_tp_size=1,
+ ) -> VLLMWeightSyncScheme:
+     """Creates a vLLM weight synchronization scheme using NCCL collectives.
+
+     This function creates a weight sync scheme that uses NCCL for high-performance
+     GPU-to-GPU weight transfers from the training model to vLLM inference workers.
+
+     Args:
+         master_address (str | None): Address of the master node for distributed init.
+             Defaults to "localhost".
+         master_port (int | None): Port of the master node for distributed init.
+             If None, a port is auto-assigned.
+         vllm_tp_size (int): vLLM tensor parallel size (gpus_per_replica). Defaults to 1.
+
+     Returns:
+         VLLMWeightSyncScheme: A weight sync scheme configured for the vLLM engine.
+     """
+     if master_address is None:
+         master_address = "localhost"
+
+     torchrl_logger.info(
+         f"Creating VLLMWeightSyncScheme with tp_size={vllm_tp_size}, "
+         f"master_address={master_address}, master_port={master_port}"
+     )
+
+     return VLLMWeightSyncScheme(
+         master_address=master_address,
+         master_port=master_port,
+         gpus_per_replica=vllm_tp_size,
+         num_replicas=1,  # For expert iteration, typically 1 replica
+         strategy="state_dict",
+     )
+
+
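+ # Example (illustrative sketch; how the scheme is wired into a collector is an
+ # assumption about the surrounding training scripts, not a fixed API):
+ #     >>> scheme = make_weight_sync_scheme(
+ #     ...     master_port=29500, vllm_tp_size=cfg.inference_model.num_devices
+ #     ... )
+ #     >>> # e.g. handed to the collector so NCCL pushes trainer weights to vLLM
+
+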
+ def compute_device_allocation(cfg):
+     """Compute device allocation for different model components.
+
+     Args:
+         cfg: The configuration object.
+
+     Returns:
+         A dictionary containing device allocations for the different components.
+     """
+     train_devices = cfg.train_model.num_devices
+     inf_devices = cfg.inference_model.num_devices
+     ref_devices = cfg.ref_model.num_devices
+
+     # Ray needs to see all GPUs: training shares the leading devices with
+     # inference, and the reference model sits after the inference devices.
+     train_start = 0
+     train_end = train_devices
+     inference_start = 0
+     inference_end = inf_devices
+     ref_start = inference_end
+     ref_end = ref_start + ref_devices
+     ray_num_gpus = train_devices + inf_devices + ref_devices
+
+     # Create device lists
+     train_model_devices = list(range(train_start, train_end))
+     inference_model_devices = list(range(inference_start, inference_end))
+     ref_model_devices = list(range(ref_start, ref_end))
+
+     # Get total unique devices for CUDA_VISIBLE_DEVICES
+     all_devices = sorted(
+         set(train_model_devices + inference_model_devices + ref_model_devices)
+     )
+     cuda_visible_devices = ",".join(map(str, all_devices))
+
+     return {
+         "train_model_devices": train_model_devices,
+         "inference_model_devices": inference_model_devices,
+         "ref_model_devices": ref_model_devices,
+         "ray_num_gpus": ray_num_gpus,
+         "cuda_visible_devices": cuda_visible_devices,
+     }
+
+
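+ # Worked example (follows directly from the arithmetic above): with
+ # train_model.num_devices=2, inference_model.num_devices=1 and
+ # ref_model.num_devices=1, training spans GPUs [0, 1], inference shares GPU 0,
+ # and the reference model lands on GPU 1:
+ #     >>> alloc = compute_device_allocation(cfg)
+ #     >>> alloc["train_model_devices"], alloc["ref_model_devices"]
+ #     ([0, 1], [1])
+ #     >>> alloc["ray_num_gpus"], alloc["cuda_visible_devices"]
+ #     (4, '0,1')
+
+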
+ def create_cosine_scheduler_with_warmup(
+     optimizer: torch.optim.Optimizer,
+     num_warmup_steps: int,
+     num_training_steps: int,
+     num_cycles: float = 0.5,
+ ) -> torch.optim.lr_scheduler.LRScheduler:
+     """Create a cosine scheduler with warmup using PyTorch's built-in schedulers.
+
+     This function creates a learning rate scheduler that:
+     1. Linearly increases the learning rate from 1% of the base learning rate to
+        the base learning rate during warmup
+     2. Follows a cosine curve from the base learning rate down to 0 after warmup
+
+     Args:
+         optimizer: The optimizer to schedule learning rates for.
+         num_warmup_steps: Number of warmup steps.
+         num_training_steps: Total number of training steps.
+         num_cycles: Number of cosine cycles (default: 0.5 for half a cycle).
+             Currently unused: the CosineAnnealingLR stage always runs half a cycle.
+
+     Returns:
+         A PyTorch learning rate scheduler.
+     """
+     from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
+
+     # Create warmup scheduler (linear increase from 1% of the base LR to the base LR)
+     warmup_scheduler = LinearLR(
+         optimizer, start_factor=0.01, end_factor=1.0, total_iters=num_warmup_steps
+     )
+
+     # Create cosine decay scheduler (from base LR to 0)
+     cosine_scheduler = CosineAnnealingLR(
+         optimizer, T_max=num_training_steps - num_warmup_steps, eta_min=0.0
+     )
+
+     # Combine warmup and cosine decay
+     scheduler = SequentialLR(
+         optimizer,
+         schedulers=[warmup_scheduler, cosine_scheduler],
+         milestones=[num_warmup_steps],
+     )
+
+     return scheduler
+
+
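+ # Example (illustrative sketch): a 1000-step run with 100 warmup steps; the LR
+ # climbs from 1e-7 (1% of 1e-5) to 1e-5, then decays to 0 along a half cosine.
+ #     >>> opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-5)
+ #     >>> sched = create_cosine_scheduler_with_warmup(opt, 100, 1000)
+ #     >>> for _ in range(1000):
+ #     ...     opt.step()
+ #     ...     sched.step()
+
+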
+ def get_wandb_run_id(wandb_logger):
+     """Get the wandb run ID from a WandbLogger instance.
+
+     Args:
+         wandb_logger: The WandbLogger instance.
+
+     Returns:
+         str: The wandb run ID, or None if not available.
+     """
+     try:
+         # Wait a bit for wandb to initialize
+         max_attempts = 10
+         for attempt in range(max_attempts):
+             if hasattr(wandb_logger, "experiment") and wandb_logger.experiment:
+                 run_id = wandb_logger.experiment.id
+                 if run_id:
+                     torchrl_logger.info(f"Got wandb run ID: {run_id}")
+                     return run_id
+             if attempt < max_attempts - 1:
+                 time.sleep(0.5)
+                 torchrl_logger.info(
+                     f"Waiting for wandb run ID, attempt {attempt + 1}/{max_attempts}"
+                 )
+
+         torchrl_logger.warning("Could not get wandb run ID after multiple attempts")
+         return None
+     except Exception as e:
+         torchrl_logger.error(f"Error getting wandb run ID: {e}")
+         return None
+
+
+ def log_training_metrics(
+     wandb_logger,
+     replay_buffer,
+     batch,
+     loss,
+     grad_norm,
+     global_step,
+     data_read_count,
+     collector,
+     start_time,
+     gradient_accumulation_steps,
+     history_str=None,
+ ):
+     """Log training metrics to wandb.
+
+     Args:
+         wandb_logger: The wandb logger instance.
+         replay_buffer: The replay buffer containing collected data.
+         batch: The current training batch.
+         loss: The computed loss object.
+         grad_norm: The gradient norm value.
+         global_step: Current global training step.
+         data_read_count: Total data read count.
+         collector: The collector instance.
+         start_time: Training start time.
+         gradient_accumulation_steps: Number of gradient accumulation steps.
+         history_str: Optional history string for logging.
+     """
+     with torch.no_grad():
+         rb_content = replay_buffer[:]
+         batch_policy_version = batch["next", "policy_version"].view(-1).min()
+         batch_policy_age = collector.policy_version - batch_policy_version
+
+         metrics = {
+             "reward from buffer": float(
+                 torch.cat(rb_content.get(("next", "reward"), as_list=True)).mean()
+             ),
+             "reward from batch": float(batch["next", "reward"].mean()),
+             "seq_length from buffer": float(
+                 torch.tensor(
+                     [
+                         t.numel()
+                         for t in rb_content.get(("tokens", "response"), as_list=True)
+                     ],
+                     dtype=torch.float,
+                 ).mean()
+             ),
+             "loss_sft, from loss": float(loss.loss_sft),
+             "loss_kl_to_ref, from loss": float(loss.loss_kl_to_ref),
+             "kl_to_ref, from loss": float(loss.kl_to_ref),
+             "grad_norm": float(grad_norm)
+             if global_step % gradient_accumulation_steps == 0
+             else 0.0,
+             "write_count, from buffer": int(replay_buffer.write_count),
+             # how many gradient steps per write
+             "gradient_step_throughput (gradient step per write)": float(
+                 global_step / replay_buffer.write_count
+             ),
+             # how many optim steps per write
+             "optim_step_throughput (optim step per write)": float(
+                 (global_step // gradient_accumulation_steps) / replay_buffer.write_count
+             ),
+             "data_read_count (total)": data_read_count,
+             "current_policy_version (collector)": collector.policy_version,
+             # FIXME: Assume batch is a single trajectory
+             # FIXME: The addition of the transform after the env instantiation + _shuttle creation
+             #  is messed up - we need the next data
+             "batch_policy_version (sampled batch)": batch_policy_version,
+             "batch_policy_age (sampled batch)": batch_policy_age,
+             "throughput (steps per second)": float(
+                 global_step / (time.time() - start_time)
+             ),
+         }
+
+         for name, value in metrics.items():
+             wandb_logger.log_scalar(name, value, step=global_step)
+
+         if history_str is not None:
+             wandb_logger.log_str("history", history_str, step=global_step)
+
+
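+ # Example call site (illustrative sketch; the local names mirror the training
+ # loop and ``cfg.train.gradient_accumulation_steps`` is an assumed config key):
+ #     >>> log_training_metrics(
+ #     ...     wandb_logger, replay_buffer, batch, loss, grad_norm,
+ #     ...     global_step, data_read_count, collector, start_time,
+ #     ...     gradient_accumulation_steps=cfg.train.gradient_accumulation_steps,
+ #     ... )
+
+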
+ class RemoteDataLogger:
+     """A remote post-processing hook that sends logging data to the main process via Ray for centralized logging."""
+
+     def __init__(self, log_queue):
+         """Initialize RemoteDataLogger with a Ray queue for logging.
+
+         Args:
+             log_queue: Ray queue for logging data.
+         """
+         self.log_queue = log_queue
+         self.last_time = None
+
+     def __call__(self, data: TensorDict):
+         self.log_data(data)
+         return data
+
+     def log_data(self, data: TensorDict):
+         logs = {}
+         if self.last_time is None:
+             self.last_time = time.time()
+         else:
+             t = time.time()
+             elapsed = t - self.last_time
+             logs["collector/time/elapsed"] = elapsed
+             self.last_time = t
+
+         # Prepare logging data
+         logs["collector/rewards/mean"] = float(data["next", "reward"].mean())
+         logs["collector/rewards/std"] = float(data["next", "reward"].std())
+         logs["collector/rewards/min"] = float(data["next", "reward"].min())
+         logs["collector/rewards/max"] = float(data["next", "reward"].max())
+
+         # Response length
+         lengths = []
+         responses = data["text", "response"]
+         for r in responses:
+             lengths.append(len(r))
+         lengths = torch.tensor(lengths, dtype=torch.float32)
+         logs["collector/response_length/mean"] = float(lengths.mean())
+         logs["collector/response_length/std"] = float(lengths.std())
+         logs["collector/response_length/min"] = float(lengths.min())
+         logs["collector/response_length/max"] = float(lengths.max())
+
+         policy_versions = data.get(("next", "policy_version"))
+         if isinstance(policy_versions, torch.Tensor):
+             policy_versions = policy_versions.float()
+             logs["collector/policy_version/mean"] = float(policy_versions.mean())
+             logs["collector/policy_version/min"] = float(policy_versions.min())
+             logs["collector/policy_version/max"] = float(policy_versions.max())
+
+         # Send to the main process via the Ray queue
+         try:
+             self.log_queue.put(logs)
+         except Exception as e:
+             torchrl_logger.error(f"Failed to send logs to main process: {e}")
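+
+
+ # Example (illustrative sketch; any object with a ``put`` method works as the
+ # queue - ``ray.util.queue.Queue`` is one option):
+ #     >>> from ray.util.queue import Queue
+ #     >>> log_queue = Queue()
+ #     >>> postproc = RemoteDataLogger(log_queue)
+ #     >>> data = postproc(data)  # called in the collector; main process drains log_queue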