torchrl-0.11.0-cp314-cp314-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/benchmark_batched_envs.py +104 -0
- benchmarks/conftest.py +91 -0
- benchmarks/ecosystem/gym_env_throughput.py +321 -0
- benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
- benchmarks/requirements.txt +7 -0
- benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
- benchmarks/test_collectors_benchmark.py +240 -0
- benchmarks/test_compressed_storage_benchmark.py +145 -0
- benchmarks/test_envs_benchmark.py +133 -0
- benchmarks/test_llm.py +101 -0
- benchmarks/test_non_tensor_env_benchmark.py +70 -0
- benchmarks/test_objectives_benchmarks.py +1199 -0
- benchmarks/test_replaybuffer_benchmark.py +254 -0
- sota-check/README.md +35 -0
- sota-implementations/README.md +142 -0
- sota-implementations/a2c/README.md +39 -0
- sota-implementations/a2c/a2c_atari.py +291 -0
- sota-implementations/a2c/a2c_mujoco.py +273 -0
- sota-implementations/a2c/utils_atari.py +240 -0
- sota-implementations/a2c/utils_mujoco.py +160 -0
- sota-implementations/bandits/README.md +7 -0
- sota-implementations/bandits/dqn.py +126 -0
- sota-implementations/cql/cql_offline.py +198 -0
- sota-implementations/cql/cql_online.py +249 -0
- sota-implementations/cql/discrete_cql_offline.py +180 -0
- sota-implementations/cql/discrete_cql_online.py +227 -0
- sota-implementations/cql/utils.py +471 -0
- sota-implementations/crossq/crossq.py +271 -0
- sota-implementations/crossq/utils.py +320 -0
- sota-implementations/ddpg/ddpg.py +231 -0
- sota-implementations/ddpg/utils.py +325 -0
- sota-implementations/decision_transformer/dt.py +163 -0
- sota-implementations/decision_transformer/lamb.py +167 -0
- sota-implementations/decision_transformer/online_dt.py +178 -0
- sota-implementations/decision_transformer/utils.py +562 -0
- sota-implementations/discrete_sac/discrete_sac.py +243 -0
- sota-implementations/discrete_sac/utils.py +324 -0
- sota-implementations/dqn/README.md +30 -0
- sota-implementations/dqn/dqn_atari.py +272 -0
- sota-implementations/dqn/dqn_cartpole.py +236 -0
- sota-implementations/dqn/utils_atari.py +132 -0
- sota-implementations/dqn/utils_cartpole.py +90 -0
- sota-implementations/dreamer/README.md +129 -0
- sota-implementations/dreamer/dreamer.py +586 -0
- sota-implementations/dreamer/dreamer_utils.py +1107 -0
- sota-implementations/expert-iteration/README.md +352 -0
- sota-implementations/expert-iteration/ei_utils.py +770 -0
- sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
- sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
- sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
- sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
- sota-implementations/gail/gail.py +327 -0
- sota-implementations/gail/gail_utils.py +68 -0
- sota-implementations/gail/ppo_utils.py +157 -0
- sota-implementations/grpo/README.md +273 -0
- sota-implementations/grpo/grpo-async.py +437 -0
- sota-implementations/grpo/grpo-sync.py +435 -0
- sota-implementations/grpo/grpo_utils.py +843 -0
- sota-implementations/grpo/requirements_gsm8k.txt +11 -0
- sota-implementations/grpo/requirements_ifeval.txt +16 -0
- sota-implementations/impala/README.md +33 -0
- sota-implementations/impala/impala_multi_node_ray.py +292 -0
- sota-implementations/impala/impala_multi_node_submitit.py +284 -0
- sota-implementations/impala/impala_single_node.py +261 -0
- sota-implementations/impala/utils.py +184 -0
- sota-implementations/iql/discrete_iql.py +230 -0
- sota-implementations/iql/iql_offline.py +164 -0
- sota-implementations/iql/iql_online.py +225 -0
- sota-implementations/iql/utils.py +437 -0
- sota-implementations/multiagent/README.md +74 -0
- sota-implementations/multiagent/iql.py +237 -0
- sota-implementations/multiagent/maddpg_iddpg.py +266 -0
- sota-implementations/multiagent/mappo_ippo.py +267 -0
- sota-implementations/multiagent/qmix_vdn.py +271 -0
- sota-implementations/multiagent/sac.py +337 -0
- sota-implementations/multiagent/utils/__init__.py +4 -0
- sota-implementations/multiagent/utils/logging.py +151 -0
- sota-implementations/multiagent/utils/utils.py +43 -0
- sota-implementations/ppo/README.md +29 -0
- sota-implementations/ppo/ppo_atari.py +305 -0
- sota-implementations/ppo/ppo_mujoco.py +293 -0
- sota-implementations/ppo/utils_atari.py +238 -0
- sota-implementations/ppo/utils_mujoco.py +152 -0
- sota-implementations/ppo_trainer/train.py +21 -0
- sota-implementations/redq/README.md +7 -0
- sota-implementations/redq/redq.py +199 -0
- sota-implementations/redq/utils.py +1060 -0
- sota-implementations/sac/sac-async.py +266 -0
- sota-implementations/sac/sac.py +239 -0
- sota-implementations/sac/utils.py +381 -0
- sota-implementations/sac_trainer/train.py +16 -0
- sota-implementations/td3/td3.py +254 -0
- sota-implementations/td3/utils.py +319 -0
- sota-implementations/td3_bc/td3_bc.py +177 -0
- sota-implementations/td3_bc/utils.py +251 -0
- torchrl/__init__.py +144 -0
- torchrl/_extension.py +74 -0
- torchrl/_torchrl.cpython-314-aarch64-linux-gnu.so +0 -0
- torchrl/_utils.py +1431 -0
- torchrl/collectors/__init__.py +48 -0
- torchrl/collectors/_base.py +1058 -0
- torchrl/collectors/_constants.py +88 -0
- torchrl/collectors/_multi_async.py +324 -0
- torchrl/collectors/_multi_base.py +1805 -0
- torchrl/collectors/_multi_sync.py +464 -0
- torchrl/collectors/_runner.py +581 -0
- torchrl/collectors/_single.py +2009 -0
- torchrl/collectors/_single_async.py +259 -0
- torchrl/collectors/collectors.py +62 -0
- torchrl/collectors/distributed/__init__.py +32 -0
- torchrl/collectors/distributed/default_configs.py +133 -0
- torchrl/collectors/distributed/generic.py +1306 -0
- torchrl/collectors/distributed/ray.py +1092 -0
- torchrl/collectors/distributed/rpc.py +1006 -0
- torchrl/collectors/distributed/sync.py +731 -0
- torchrl/collectors/distributed/utils.py +160 -0
- torchrl/collectors/llm/__init__.py +10 -0
- torchrl/collectors/llm/base.py +494 -0
- torchrl/collectors/llm/ray_collector.py +275 -0
- torchrl/collectors/llm/utils.py +36 -0
- torchrl/collectors/llm/weight_update/__init__.py +10 -0
- torchrl/collectors/llm/weight_update/vllm.py +348 -0
- torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
- torchrl/collectors/utils.py +433 -0
- torchrl/collectors/weight_update.py +591 -0
- torchrl/csrc/numpy_utils.h +38 -0
- torchrl/csrc/pybind.cpp +27 -0
- torchrl/csrc/segment_tree.h +458 -0
- torchrl/csrc/torch_utils.h +34 -0
- torchrl/csrc/utils.cpp +48 -0
- torchrl/csrc/utils.h +31 -0
- torchrl/data/__init__.py +187 -0
- torchrl/data/datasets/__init__.py +58 -0
- torchrl/data/datasets/atari_dqn.py +878 -0
- torchrl/data/datasets/common.py +281 -0
- torchrl/data/datasets/d4rl.py +489 -0
- torchrl/data/datasets/d4rl_infos.py +187 -0
- torchrl/data/datasets/gen_dgrl.py +375 -0
- torchrl/data/datasets/minari_data.py +643 -0
- torchrl/data/datasets/openml.py +177 -0
- torchrl/data/datasets/openx.py +798 -0
- torchrl/data/datasets/roboset.py +363 -0
- torchrl/data/datasets/utils.py +11 -0
- torchrl/data/datasets/vd4rl.py +432 -0
- torchrl/data/llm/__init__.py +34 -0
- torchrl/data/llm/dataset.py +491 -0
- torchrl/data/llm/history.py +1378 -0
- torchrl/data/llm/prompt.py +198 -0
- torchrl/data/llm/reward.py +225 -0
- torchrl/data/llm/topk.py +186 -0
- torchrl/data/llm/utils.py +543 -0
- torchrl/data/map/__init__.py +21 -0
- torchrl/data/map/hash.py +185 -0
- torchrl/data/map/query.py +204 -0
- torchrl/data/map/tdstorage.py +363 -0
- torchrl/data/map/tree.py +1434 -0
- torchrl/data/map/utils.py +103 -0
- torchrl/data/postprocs/__init__.py +8 -0
- torchrl/data/postprocs/postprocs.py +391 -0
- torchrl/data/replay_buffers/__init__.py +99 -0
- torchrl/data/replay_buffers/checkpointers.py +622 -0
- torchrl/data/replay_buffers/ray_buffer.py +292 -0
- torchrl/data/replay_buffers/replay_buffers.py +2376 -0
- torchrl/data/replay_buffers/samplers.py +2578 -0
- torchrl/data/replay_buffers/scheduler.py +265 -0
- torchrl/data/replay_buffers/storages.py +2412 -0
- torchrl/data/replay_buffers/utils.py +1042 -0
- torchrl/data/replay_buffers/writers.py +781 -0
- torchrl/data/tensor_specs.py +7101 -0
- torchrl/data/utils.py +334 -0
- torchrl/envs/__init__.py +265 -0
- torchrl/envs/async_envs.py +1105 -0
- torchrl/envs/batched_envs.py +3093 -0
- torchrl/envs/common.py +4241 -0
- torchrl/envs/custom/__init__.py +11 -0
- torchrl/envs/custom/chess.py +617 -0
- torchrl/envs/custom/llm.py +214 -0
- torchrl/envs/custom/pendulum.py +401 -0
- torchrl/envs/custom/san_moves.txt +29274 -0
- torchrl/envs/custom/tictactoeenv.py +288 -0
- torchrl/envs/env_creator.py +263 -0
- torchrl/envs/gym_like.py +752 -0
- torchrl/envs/libs/__init__.py +68 -0
- torchrl/envs/libs/_gym_utils.py +326 -0
- torchrl/envs/libs/brax.py +846 -0
- torchrl/envs/libs/dm_control.py +544 -0
- torchrl/envs/libs/envpool.py +447 -0
- torchrl/envs/libs/gym.py +2239 -0
- torchrl/envs/libs/habitat.py +138 -0
- torchrl/envs/libs/isaac_lab.py +87 -0
- torchrl/envs/libs/isaacgym.py +203 -0
- torchrl/envs/libs/jax_utils.py +166 -0
- torchrl/envs/libs/jumanji.py +963 -0
- torchrl/envs/libs/meltingpot.py +599 -0
- torchrl/envs/libs/openml.py +153 -0
- torchrl/envs/libs/openspiel.py +652 -0
- torchrl/envs/libs/pettingzoo.py +1042 -0
- torchrl/envs/libs/procgen.py +351 -0
- torchrl/envs/libs/robohive.py +429 -0
- torchrl/envs/libs/smacv2.py +645 -0
- torchrl/envs/libs/unity_mlagents.py +891 -0
- torchrl/envs/libs/utils.py +147 -0
- torchrl/envs/libs/vmas.py +813 -0
- torchrl/envs/llm/__init__.py +63 -0
- torchrl/envs/llm/chat.py +730 -0
- torchrl/envs/llm/datasets/README.md +4 -0
- torchrl/envs/llm/datasets/__init__.py +17 -0
- torchrl/envs/llm/datasets/gsm8k.py +353 -0
- torchrl/envs/llm/datasets/ifeval.py +274 -0
- torchrl/envs/llm/envs.py +789 -0
- torchrl/envs/llm/libs/README.md +3 -0
- torchrl/envs/llm/libs/__init__.py +8 -0
- torchrl/envs/llm/libs/mlgym.py +869 -0
- torchrl/envs/llm/reward/__init__.py +10 -0
- torchrl/envs/llm/reward/gsm8k.py +324 -0
- torchrl/envs/llm/reward/ifeval/README.md +13 -0
- torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
- torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
- torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
- torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
- torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
- torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
- torchrl/envs/llm/transforms/__init__.py +55 -0
- torchrl/envs/llm/transforms/browser.py +292 -0
- torchrl/envs/llm/transforms/dataloading.py +859 -0
- torchrl/envs/llm/transforms/format.py +73 -0
- torchrl/envs/llm/transforms/kl.py +1544 -0
- torchrl/envs/llm/transforms/policy_version.py +189 -0
- torchrl/envs/llm/transforms/reason.py +323 -0
- torchrl/envs/llm/transforms/tokenizer.py +321 -0
- torchrl/envs/llm/transforms/tools.py +1955 -0
- torchrl/envs/model_based/__init__.py +9 -0
- torchrl/envs/model_based/common.py +180 -0
- torchrl/envs/model_based/dreamer.py +112 -0
- torchrl/envs/transforms/__init__.py +147 -0
- torchrl/envs/transforms/functional.py +48 -0
- torchrl/envs/transforms/gym_transforms.py +203 -0
- torchrl/envs/transforms/module.py +341 -0
- torchrl/envs/transforms/r3m.py +372 -0
- torchrl/envs/transforms/ray_service.py +663 -0
- torchrl/envs/transforms/rb_transforms.py +214 -0
- torchrl/envs/transforms/transforms.py +11835 -0
- torchrl/envs/transforms/utils.py +94 -0
- torchrl/envs/transforms/vc1.py +307 -0
- torchrl/envs/transforms/vecnorm.py +845 -0
- torchrl/envs/transforms/vip.py +407 -0
- torchrl/envs/utils.py +1718 -0
- torchrl/envs/vec_envs.py +11 -0
- torchrl/modules/__init__.py +206 -0
- torchrl/modules/distributions/__init__.py +73 -0
- torchrl/modules/distributions/continuous.py +830 -0
- torchrl/modules/distributions/discrete.py +908 -0
- torchrl/modules/distributions/truncated_normal.py +187 -0
- torchrl/modules/distributions/utils.py +233 -0
- torchrl/modules/llm/__init__.py +62 -0
- torchrl/modules/llm/backends/__init__.py +65 -0
- torchrl/modules/llm/backends/vllm/__init__.py +94 -0
- torchrl/modules/llm/backends/vllm/_models.py +46 -0
- torchrl/modules/llm/backends/vllm/base.py +72 -0
- torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
- torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
- torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
- torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
- torchrl/modules/llm/policies/__init__.py +28 -0
- torchrl/modules/llm/policies/common.py +1809 -0
- torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
- torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
- torchrl/modules/llm/utils.py +23 -0
- torchrl/modules/mcts/__init__.py +21 -0
- torchrl/modules/mcts/scores.py +579 -0
- torchrl/modules/models/__init__.py +86 -0
- torchrl/modules/models/batchrenorm.py +119 -0
- torchrl/modules/models/decision_transformer.py +179 -0
- torchrl/modules/models/exploration.py +731 -0
- torchrl/modules/models/llm.py +156 -0
- torchrl/modules/models/model_based.py +596 -0
- torchrl/modules/models/models.py +1712 -0
- torchrl/modules/models/multiagent.py +1067 -0
- torchrl/modules/models/recipes/impala.py +185 -0
- torchrl/modules/models/utils.py +162 -0
- torchrl/modules/planners/__init__.py +10 -0
- torchrl/modules/planners/cem.py +228 -0
- torchrl/modules/planners/common.py +73 -0
- torchrl/modules/planners/mppi.py +265 -0
- torchrl/modules/tensordict_module/__init__.py +89 -0
- torchrl/modules/tensordict_module/actors.py +2457 -0
- torchrl/modules/tensordict_module/common.py +529 -0
- torchrl/modules/tensordict_module/exploration.py +814 -0
- torchrl/modules/tensordict_module/probabilistic.py +321 -0
- torchrl/modules/tensordict_module/rnn.py +1639 -0
- torchrl/modules/tensordict_module/sequence.py +132 -0
- torchrl/modules/tensordict_module/world_models.py +34 -0
- torchrl/modules/utils/__init__.py +38 -0
- torchrl/modules/utils/mappings.py +9 -0
- torchrl/modules/utils/utils.py +89 -0
- torchrl/objectives/__init__.py +78 -0
- torchrl/objectives/a2c.py +659 -0
- torchrl/objectives/common.py +753 -0
- torchrl/objectives/cql.py +1346 -0
- torchrl/objectives/crossq.py +710 -0
- torchrl/objectives/ddpg.py +453 -0
- torchrl/objectives/decision_transformer.py +371 -0
- torchrl/objectives/deprecated.py +516 -0
- torchrl/objectives/dqn.py +683 -0
- torchrl/objectives/dreamer.py +488 -0
- torchrl/objectives/functional.py +48 -0
- torchrl/objectives/gail.py +258 -0
- torchrl/objectives/iql.py +996 -0
- torchrl/objectives/llm/__init__.py +30 -0
- torchrl/objectives/llm/grpo.py +846 -0
- torchrl/objectives/llm/sft.py +482 -0
- torchrl/objectives/multiagent/__init__.py +8 -0
- torchrl/objectives/multiagent/qmixer.py +396 -0
- torchrl/objectives/ppo.py +1669 -0
- torchrl/objectives/redq.py +683 -0
- torchrl/objectives/reinforce.py +530 -0
- torchrl/objectives/sac.py +1580 -0
- torchrl/objectives/td3.py +570 -0
- torchrl/objectives/td3_bc.py +625 -0
- torchrl/objectives/utils.py +782 -0
- torchrl/objectives/value/__init__.py +28 -0
- torchrl/objectives/value/advantages.py +1956 -0
- torchrl/objectives/value/functional.py +1459 -0
- torchrl/objectives/value/utils.py +360 -0
- torchrl/record/__init__.py +17 -0
- torchrl/record/loggers/__init__.py +23 -0
- torchrl/record/loggers/common.py +48 -0
- torchrl/record/loggers/csv.py +226 -0
- torchrl/record/loggers/mlflow.py +142 -0
- torchrl/record/loggers/tensorboard.py +139 -0
- torchrl/record/loggers/trackio.py +163 -0
- torchrl/record/loggers/utils.py +78 -0
- torchrl/record/loggers/wandb.py +214 -0
- torchrl/record/recorder.py +554 -0
- torchrl/services/__init__.py +79 -0
- torchrl/services/base.py +109 -0
- torchrl/services/ray_service.py +453 -0
- torchrl/testing/__init__.py +107 -0
- torchrl/testing/assertions.py +179 -0
- torchrl/testing/dist_utils.py +122 -0
- torchrl/testing/env_creators.py +227 -0
- torchrl/testing/env_helper.py +35 -0
- torchrl/testing/gym_helpers.py +156 -0
- torchrl/testing/llm_mocks.py +119 -0
- torchrl/testing/mocking_classes.py +2720 -0
- torchrl/testing/modules.py +295 -0
- torchrl/testing/mp_helpers.py +15 -0
- torchrl/testing/ray_helpers.py +293 -0
- torchrl/testing/utils.py +190 -0
- torchrl/trainers/__init__.py +42 -0
- torchrl/trainers/algorithms/__init__.py +11 -0
- torchrl/trainers/algorithms/configs/__init__.py +705 -0
- torchrl/trainers/algorithms/configs/collectors.py +216 -0
- torchrl/trainers/algorithms/configs/common.py +41 -0
- torchrl/trainers/algorithms/configs/data.py +308 -0
- torchrl/trainers/algorithms/configs/envs.py +104 -0
- torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
- torchrl/trainers/algorithms/configs/logging.py +80 -0
- torchrl/trainers/algorithms/configs/modules.py +570 -0
- torchrl/trainers/algorithms/configs/objectives.py +177 -0
- torchrl/trainers/algorithms/configs/trainers.py +340 -0
- torchrl/trainers/algorithms/configs/transforms.py +955 -0
- torchrl/trainers/algorithms/configs/utils.py +252 -0
- torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
- torchrl/trainers/algorithms/configs/weight_update.py +159 -0
- torchrl/trainers/algorithms/ppo.py +373 -0
- torchrl/trainers/algorithms/sac.py +308 -0
- torchrl/trainers/helpers/__init__.py +40 -0
- torchrl/trainers/helpers/collectors.py +416 -0
- torchrl/trainers/helpers/envs.py +573 -0
- torchrl/trainers/helpers/logger.py +33 -0
- torchrl/trainers/helpers/losses.py +132 -0
- torchrl/trainers/helpers/models.py +658 -0
- torchrl/trainers/helpers/replay_buffer.py +59 -0
- torchrl/trainers/helpers/trainers.py +301 -0
- torchrl/trainers/trainers.py +2052 -0
- torchrl/weight_update/__init__.py +33 -0
- torchrl/weight_update/_distributed.py +749 -0
- torchrl/weight_update/_mp.py +624 -0
- torchrl/weight_update/_noupdate.py +102 -0
- torchrl/weight_update/_ray.py +1032 -0
- torchrl/weight_update/_rpc.py +284 -0
- torchrl/weight_update/_shared.py +891 -0
- torchrl/weight_update/llm/__init__.py +32 -0
- torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
- torchrl/weight_update/llm/vllm_nccl.py +710 -0
- torchrl/weight_update/utils.py +73 -0
- torchrl/weight_update/weight_sync_schemes.py +1244 -0
- torchrl-0.11.0.dist-info/METADATA +1308 -0
- torchrl-0.11.0.dist-info/RECORD +394 -0
- torchrl-0.11.0.dist-info/WHEEL +5 -0
- torchrl-0.11.0.dist-info/entry_points.txt +2 -0
- torchrl-0.11.0.dist-info/licenses/LICENSE +21 -0
- torchrl-0.11.0.dist-info/top_level.txt +7 -0
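
The diff body below is a single hunk adding a new 843-line file. Judging by its size and the GRPO helper functions it defines, it appears to correspond to sota-implementations/grpo/grpo_utils.py from the listing above.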
@@ -0,0 +1,843 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import annotations
+
+import functools
+
+import time
+import warnings
+from typing import Any, Literal
+
+import torch
+from omegaconf import DictConfig
+from torch import device as torch_device, dtype as torch_dtype
+
+from torchrl._utils import logger as torchrl_logger, timeit
+from torchrl.envs.llm import AddThinkingPrompt, GSM8KEnv, KLRewardTransform, RetrieveKL
+from torchrl.envs.llm.datasets.ifeval import IFEvalEnv
+from torchrl.modules.llm import TransformersWrapper, vLLMWrapper
+from torchrl.weight_update.llm import VLLMWeightSyncScheme
+from transformers.models.auto.modeling_auto import AutoModelForCausalLM
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+
+def check_grpo_dependencies() -> None:
+    """Check for required GRPO dependencies and provide helpful error messages.
+
+    This function checks for critical dependencies needed for GRPO training and
+    provides installation instructions for missing packages.
+    """
+    missing_packages = []
+    missing_optional = []
+
+    # Core required packages
+    required_packages = {
+        "datasets": "pip install datasets",
+        "peft": "pip install peft",
+        "wandb": "pip install wandb",
+        "vllm": "pip install vllm",
+        "transformers": "pip install transformers",
+        "accelerate": "pip install accelerate",
+        "ray": "pip install ray",
+        "tqdm": "pip install tqdm",
+    }
+
+    # Optional but recommended packages
+    optional_packages = {
+        "flash_attn": "pip install flash-attn",
+        "bitsandbytes": "pip install bitsandbytes",
+        "xformers": "pip install xformers",
+    }
+
+    # Check required packages
+    for package, install_cmd in required_packages.items():
+        try:
+            __import__(package)
+        except ImportError:
+            missing_packages.append((package, install_cmd))
+
+    # Check optional packages
+    for package, install_cmd in optional_packages.items():
+        try:
+            __import__(package)
+        except ImportError:
+            missing_optional.append((package, install_cmd))
+
+    # Report missing required packages
+    if missing_packages:
+        error_msg = (
+            "Missing required packages for GRPO training:\n"
+            + "\n".join(f"  - {pkg}: {cmd}" for pkg, cmd in missing_packages)
+            + "\n\nYou can install all GRPO dependencies with:\n"
+            + "  pip install torchrl[grpo]\n"
+            + "or install individual packages as shown above."
+        )
+        raise ImportError(error_msg)
+
+    # Report missing optional packages as warnings
+    if missing_optional:
+        warning_msg = (
+            "Missing optional packages that may improve GRPO performance:\n"
+            + "\n".join(f"  - {pkg}: {cmd}" for pkg, cmd in missing_optional)
+            + "\n\nThese packages are optional but recommended for optimal performance."
+        )
+        warnings.warn(warning_msg, UserWarning, stacklevel=2)
+
+    torchrl_logger.info("✓ All required GRPO dependencies are available")
+
+
+def get_tokenizer(cfg: DictConfig) -> PreTrainedTokenizer:
+    from transformers import AutoTokenizer
+
+    model_name = cfg.model.name
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # tokenizer.eos_token = "<|im_end|>"
+    if tokenizer.pad_token == tokenizer.eos_token:
+        tokenizer.pad_token = "PAD"
+    tokenizer.padding_side = "left"
+    return tokenizer
+
+
+def get_train_model(
+    cfg: DictConfig,
+    devices: list[int] | None = None,
+) -> tuple[TransformersWrapper, PreTrainedTokenizer]:
+    """Creates and configures the training model with LoRA adapters.
+
+    This function initializes the main training model with LoRA adapters and other
+    training-specific configurations like gradient checkpointing. The model is wrapped
+    in a TransformersWrapper for policy training.
+
+    Args:
+        cfg (DictConfig): The hydra configuration object containing model and training settings.
+            Expected to have train_model section with LoRA, quantization, and other
+            training-specific parameters.
+
+    Returns:
+        tuple[TransformersWrapper, PreTrainedTokenizer]:
+            - policy_training: The wrapped training model
+            - train_tokenizer: The tokenizer for the model
+
+    Raises:
+        RuntimeError: If CUDA is not available or if device allocation fails
+    """
+    torchrl_logger.info("Creating train model")
+
+    # Set model dtype explicitly
+    model_dtype = getattr(torch, cfg.train_model.torch_dtype)
+
+    # Get configured devices or default to [0]
+    train_devices = devices if devices is not None else [0]
+
+    # Create max_memory dict - set 0 memory for GPUs we don't want to use
+    max_memory = {}
+    for i in range(torch.cuda.device_count()):
+        if i in train_devices:
+            max_memory[i] = "24GiB"  # Allow max memory for devices we want to use
+        else:
+            max_memory[i] = "0GiB"  # No memory for other devices
+    max_memory["cpu"] = "24GiB"  # Allow CPU memory as fallback
+
+    # Let HF handle distribution with max_memory
+    device_map = "balanced" if len(train_devices) > 1 else f"cuda:{train_devices[0]}"
+    train_model, train_tokenizer = get_hf_model(
+        cfg.model.name,
+        device_map=device_map,
+        max_memory=max_memory,
+        lora=cfg.train_model.lora.enabled,
+        lora_r=cfg.train_model.lora.r,
+        lora_alpha=cfg.train_model.lora.alpha,
+        lora_dropout=cfg.train_model.lora.dropout,
+        gradient_checkpointing=cfg.train_model.gradient_checkpointing,
+        quantize=cfg.train_model.quantization.enabled,
+        torch_dtype=model_dtype,
+        attn_implementation=cfg.train_model.attn_implementation,
+        compile=cfg.model.compile,
+    )
+
+    # Force all model parameters to the same dtype
+    for param in train_model.parameters():
+        param.data = param.data.to(model_dtype)
+
+    policy_training = TransformersWrapper(
+        train_model,
+        tokenizer=train_tokenizer,
+        input_mode="tokens" if not cfg.env.reasoning else "history",
+        generate=False,
+        return_log_probs=True,
+        pad_output=False,
+        device=torch.device("cuda:0"),
+        # Enable packing when cfg.train.packing=True by disabling padding
+        pad_model_input=not cfg.train.packing,
+    )
+    # Ensure model stays in eval mode after wrapping
+    policy_training.model.eval()
+    policy_training.model.train(False)
+    return policy_training, train_tokenizer
+
+
+def get_inference_model(
+    cfg: DictConfig,
+    devices: list[int] | None = None,
+    make_ray_worker: bool = True,
+    tokenizer: PreTrainedTokenizer | None = None,
+) -> vLLMWrapper:
+    """Creates the vLLM-based inference model for fast generation.
+
+    This function initializes a vLLM model server for efficient inference and wraps
+    it in a vLLMWrapper for policy inference. vLLM provides optimized generation
+    with better throughput than standard HuggingFace generation.
+
+    Args:
+        cfg (DictConfig): The hydra configuration object containing model settings.
+            Expected to have inference_model section with vLLM-specific parameters
+            like gpu_memory_utilization and generation settings.
+        devices (list[int], optional): The devices to use for the inference model. Default: `None`.
+        make_ray_worker (bool, optional): Whether to make a ray worker. Default: `True`.
+        tokenizer (PreTrainedTokenizer, optional): The tokenizer to use with the inference model. Default: `None`.
+
+    Returns:
+        vLLMWrapper: The wrapped vLLM model ready for inference.
+
+    Raises:
+        AssertionError: If the vLLM server or model initialization fails
+    """
+    from torchrl.modules.llm.backends.vllm import AsyncVLLM
+
+    num_devices = cfg.inference_model.num_devices
+    if num_devices is None:
+        vllm_devices = devices if devices is not None else [1]
+        num_devices = len(vllm_devices)
+    else:
+        vllm_devices = None
+    torchrl_logger.info(
+        f"Creating AsyncVLLM inference model with num_devices={num_devices}, devices={vllm_devices}"
+    )
+
+    model_name = cfg.model.name
+
+    # Use AsyncVLLM for better performance and async processing
+    verbose = getattr(cfg.inference_model, "verbose", True)
+    compile_model = getattr(
+        cfg.inference_model, "compile", False
+    )  # Disabled by default for GRPO
+
+    # Build parameters dict for AsyncVLLM with all config options
+    inference_params = {
+        "model_name": model_name,
+        "num_devices": 1,
+        "num_replicas": num_devices,
+        "gpu_memory_utilization": cfg.inference_model.gpu_memory_utilization,
+        "enforce_eager": cfg.inference_model.enforce_eager,
+        "verbose": verbose,
+        "compile": compile_model,
+    }
+
+    # CRITICAL FIX: Configure attention implementation to prevent Flash Attention errors
+    # vLLM doesn't accept attn_implementation directly through AsyncEngineArgs
+    # Instead, we set the VLLM_ATTENTION_BACKEND environment variable
+    if hasattr(cfg.inference_model, "attn_implementation"):
+        import os
+
+        attn_impl = cfg.inference_model.attn_implementation
+
+        # Map common attention implementations to vLLM backend names
+        attn_backend_map = {
+            "flash_attention_2": "FLASH_ATTN",
+            "flash_attn": "FLASH_ATTN",
+            "sdpa": "TORCH_SDPA",
+            "torch_sdpa": "TORCH_SDPA",
+            "xformers": "XFORMERS",
+        }
+
+        vllm_backend = attn_backend_map.get(attn_impl, attn_impl.upper())
+        os.environ["VLLM_ATTENTION_BACKEND"] = vllm_backend
+
+        torchrl_logger.info(
+            f"Setting VLLM_ATTENTION_BACKEND={vllm_backend} (from config: {attn_impl})"
+        )
+
+    # Handle FP32 output configuration
+    if hasattr(cfg.inference_model, "enable_fp32_output"):
+        enable_fp32 = cfg.inference_model.enable_fp32_output
+        if enable_fp32:
+            os.environ["VLLM_ENABLE_FP32_OUTPUT"] = "1"
+            torchrl_logger.info(
+                "Enabled FP32 output for vLLM (VLLM_ENABLE_FP32_OUTPUT=1). "
+                "This will use FP32 for the final output layer if the model supports it."
+            )
+            # Add to inference params so it gets passed to AsyncVLLM
+            inference_params["enable_fp32_output"] = enable_fp32
+
+    # Add other common vLLM parameters from config if present
+    optional_vllm_params = [
+        "max_model_len",
+        "dtype",
+        "trust_remote_code",
+        "seed",
+        "swap_space",
+        "cpu_offload_gb",
+        "enable_prefix_caching",
+        "tensor_parallel_size",
+        "pipeline_parallel_size",
+    ]
+
+    for param in optional_vllm_params:
+        if hasattr(cfg.inference_model, param):
+            value = getattr(cfg.inference_model, param)
+            if value is not None:
+                inference_params[param] = value
+
+    # Handle torch_dtype specifically (convert string to torch dtype)
+    if hasattr(cfg.inference_model, "torch_dtype"):
+        dtype_str = cfg.inference_model.torch_dtype
+        if dtype_str is not None:
+            if isinstance(dtype_str, str):
+                inference_params["dtype"] = getattr(torch, dtype_str)
+            else:
+                inference_params["dtype"] = dtype_str
+
+    inference_server = AsyncVLLM.from_pretrained(**inference_params)
+    assert inference_server is not None
+    if tokenizer is None:
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        if tokenizer.pad_token == tokenizer.eos_token:
+            tokenizer.pad_token = "PAD"
+        tokenizer.padding_side = "left"
+    policy = vLLMWrapper(
+        inference_server,
+        input_mode="history",
+        chat_template_name="qwen",
+        return_log_probs=not cfg.env.reasoning,
+        tokenizer=tokenizer,
+        pad_output=False,
+        generate_kwargs={
+            "max_tokens": cfg.inference_model.max_tokens,
+            "include_stop_str_in_output": cfg.inference_model.include_stop_str_in_output,
+            "temperature": cfg.inference_model.temperature,
+            "top_p": cfg.inference_model.top_p,
+        },
+    )
+    assert policy.model is not None
+    return policy
+
+
+def get_ref_model(
+    cfg: DictConfig,
+    tokenizer: PreTrainedTokenizer,
+    devices: list[int] | None = None,
+) -> TransformersWrapper:
+    """Creates the reference model for KL penalty computation.
+
+    This function initializes a frozen copy of the base model to serve as the
+    reference model for KL divergence computation. The reference model is typically
+    quantized and does not require gradient computation.
+
+    Args:
+        cfg (DictConfig): The hydra configuration object containing model settings.
+            Expected to have ref_model section with quantization and attention settings.
+        tokenizer (PreTrainedTokenizer): The tokenizer to use with the reference model.
+
+    Returns:
+        TransformersWrapper: The wrapped reference model in eval mode with detached weights.
+    """
+    from tensordict import TensorDict
+
+    torchrl_logger.info("Creating ref model")
+
+    # Get configured devices or default to [2]
+    if cfg.ref_model.num_devices is None:
+        ref_devices = devices if devices is not None else [2]
+    else:
+        ref_devices = list(range(cfg.ref_model.num_devices))
+
+    # Create max_memory dict - set 0 memory for GPUs we don't want to use
+    max_memory = {}
+    for i in range(torch.cuda.device_count()):
+        if i in ref_devices:
+            max_memory[i] = "24GiB"  # Allow max memory for devices we want to use
+        else:
+            max_memory[i] = "0GiB"  # No memory for other devices
+    max_memory["cpu"] = "24GiB"  # Allow CPU memory as fallback
+
+    # Let HF handle distribution with max_memory
+    device_map = "balanced" if len(ref_devices) > 1 else f"cuda:{ref_devices[0]}"
+    model_name = cfg.model.name
+
+    ref_model = get_hf_model(
+        model_name,
+        device_map=device_map,
+        max_memory=max_memory,
+        torch_dtype=getattr(torch, cfg.ref_model.torch_dtype),
+        quantize=cfg.ref_model.quantization.enabled,
+        gradient_checkpointing=cfg.ref_model.gradient_checkpointing,
+        attn_implementation=cfg.ref_model.attn_implementation,
+        lora=False,  # Reference model doesn't need LoRA
+        requires_grad=False,
+    )[0].eval()
+    # Detach weights
+    TensorDict.from_module(ref_model).data.to_module(ref_model)
+    ref_model = TransformersWrapper(
+        ref_model,
+        input_mode="tokens" if not cfg.env.reasoning else "history",
+        tokenizer=tokenizer,
+        generate=False,
+        return_log_probs=True,
+        pad_output=False,
+        device=torch.device("cuda:0"),
+    )
+    return ref_model
+
+
+def get_hf_model(
+    model_name: str,
+    torch_dtype: torch_dtype = torch.float32,
+    lora_r: int = 8,
+    lora_alpha: int = 16,
+    lora_dropout: float = 0.1,
+    quantize: bool = False,
+    fsdp: str = "",
+    fsdp_config: Any = None,
+    gradient_checkpointing: bool = True,
+    device_map: str
+    | dict[str, int | str | torch_device]
+    | int
+    | torch_device
+    | None = None,
+    lora: bool = True,
+    attn_implementation: Literal["flash_attention_2", "flex_attention", "sdpa"]
+    | None = "flex_attention",
+    requires_grad: bool = True,
+    compile: bool = False,
+    max_memory: dict[str, str] | None = None,
+) -> tuple[AutoModelForCausalLM, PreTrainedTokenizer]:
+    """Creates and configures a HuggingFace model with optional optimizations.
+
+    Args:
+        model_name (str): HuggingFace model identifier (e.g., "Qwen/Qwen2.5-3B")
+        torch_dtype (torch.dtype, optional): Model precision. Default: torch.float32
+        lora_r (int, optional): LoRA rank - controls capacity of adaptations. Default: 8
+        lora_alpha (int, optional): LoRA alpha - scales the adaptations. Default: 16
+        lora_dropout (float, optional): Dropout probability for LoRA layers. Default: 0.1
+        quantize (bool, optional): Whether to enable 4-bit quantization. Default: False
+        fsdp (str, optional): Fully Sharded Data Parallel configuration. Default: ""
+        fsdp_config (Any, optional): Additional FSDP configurations. Default: None
+        gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Default: True
+        device_map (str | dict | int | torch.device | None, optional): Device placement strategy. Default: None
+        lora (bool, optional): Whether to apply LoRA adapters. Default: True
+        attn_implementation (Literal["flash_attention_2", "flex_attention", "sdpa"] | None, optional):
+            Attention implementation to use. Default: "flex_attention"
+        requires_grad (bool, optional): Whether to enable gradient computation. Default: True
+        compile (bool, optional): Whether to enable model compilation. Default: False
+        max_memory (dict[str, str], optional): Memory configuration for distributed training. Default: {}
+
+    Returns:
+        tuple[AutoModelForCausalLM, PreTrainedTokenizer]:
+            - model: The configured HuggingFace model
+            - tokenizer: The associated tokenizer
+
+    Raises:
+        ImportError: If required dependencies are not installed
+        RuntimeError: If model initialization fails
+    """
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    if max_memory is None:
+        max_memory = {}
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # tokenizer.eos_token = "<|im_end|>"
+    if tokenizer.pad_token == tokenizer.eos_token:
+        tokenizer.pad_token = "PAD"
+    tokenizer.padding_side = "left"
+
+    # Configure model settings for mixed precision
+    # Store original dtype to restore it later
+    original_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(torch_dtype)
+
+    model_configs = {
+        "torch_dtype": torch_dtype,
+        "device_map": device_map if device_map is not None else "auto",
+        "max_memory": max_memory,
+    }
+    if torch.cuda.is_available() and attn_implementation:
+        torchrl_logger.info(f"{attn_implementation} init")
+        model_configs["attn_implementation"] = attn_implementation
+
+    try:
+        # Configure training settings based on FSDP usage
+        if fsdp != "" and fsdp_config is not None:
+            torchrl_logger.info("Configurations for FSDP")
+            bnb_config_params = {"bnb_4bit_quant_storage": torch_dtype}
+        else:
+            bnb_config_params = {}
+
+        # Enable Quantization
+        if quantize:
+            try:
+                from transformers.utils.quantization_config import BitsAndBytesConfig
+            except ImportError:
+                raise ImportError(
+                    "Please install transformers with bitsandbytes support"
+                )
+
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch_dtype,
+                **bnb_config_params,
+            )
+            model_configs["quantization_config"] = bnb_config
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            use_cache=not gradient_checkpointing,
+            cache_dir="/tmp/.cache",
+            **model_configs,
+        )
+
+        # Configure gradient checkpointing based on FSDP usage
+        if fsdp == "" and fsdp_config is None:
+            if gradient_checkpointing:
+                torchrl_logger.info("gradient_checkpointing enabled")
+                model.gradient_checkpointing_enable()
+        else:
+            if gradient_checkpointing:
+                torchrl_logger.info("gradient_checkpointing enabled")
+                model.gradient_checkpointing_enable(
+                    gradient_checkpointing_kwargs={"use_reentrant": False}
+                )
+
+        if lora:
+            try:
+                from peft import get_peft_model, LoraConfig
+            except ImportError:
+                raise ImportError("Please install peft: pip install peft")
+
+            # Create LoRA config with explicit dtype setting
+            lora_config = LoraConfig(
+                r=lora_r,
+                lora_alpha=lora_alpha,
+                target_modules="all-linear",
+                lora_dropout=0.0,  # Disable dropout for RL training
+                bias="none",
+                task_type="CAUSAL_LM",
+                inference_mode=True,  # Force inference mode for consistent behavior
+                init_lora_weights=True,  # This ensures weights are initialized
+            )
+
+            # Initialize LoRA model
+            model = get_peft_model(
+                model,
+                lora_config,
+                autocast_adapter_dtype=False,  # Prevent automatic casting of adapter layers
+            )
+
+            # Force LoRA layers to correct dtype and eval mode
+            for n, p in model.named_parameters():
+                if "lora_" in n:  # Only convert LoRA parameters
+                    p.data = p.data.to(torch_dtype)
+
+        model.eval()  # Ensure model is in eval mode
+        if requires_grad:
+            model.requires_grad_(True)
+
+        return model, tokenizer
+
+    finally:
+        # Restore original dtype
+        torch.set_default_dtype(original_dtype)
+
+
+def make_weight_sync_scheme(
+    vllm_engine,
+) -> VLLMWeightSyncScheme:
+    """Creates a vLLM weight synchronization scheme using NCCL collectives.
+
+    This function creates a weight sync scheme that uses NCCL for high-performance
+    GPU-to-GPU weight transfers from the training model to vLLM inference workers.
+
+    Args:
+        vllm_engine: A vLLM engine implementing the RLvLLMEngine interface
+            (like RayLLMWorker, LocalLLMWrapper, or AsyncVLLM).
+            This is typically obtained from the inference policy's model attribute.
+
+    Returns:
+        VLLMWeightSyncScheme: A weight sync scheme configured for the vLLM engine.
+    """
+    # Get configuration from the vLLM engine
+    tp_size = vllm_engine.get_tp_size()
+    num_replicas = getattr(vllm_engine, "num_replicas", 1)
+    master_address = vllm_engine.get_master_address()
+    master_port = vllm_engine.get_master_port()
+
+    torchrl_logger.info(
+        f"Creating VLLMWeightSyncScheme with tp_size={tp_size}, "
+        f"num_replicas={num_replicas}, master_address={master_address}, "
+        f"master_port={master_port}"
+    )
+
+    return VLLMWeightSyncScheme(
+        master_address=master_address,
+        master_port=master_port,
+        gpus_per_replica=tp_size,
+        num_replicas=num_replicas,
+        strategy="state_dict",
+    )
+
+
+def compute_device_allocation(cfg):
+    """Compute device allocations and Ray GPU config.
+
+    Args:
+        cfg: The configuration object
+
+    Returns:
+        dict: Updated device configuration containing:
+            - train_model_devices: list of devices for training
+            - inference_model_devices: list of devices for inference
+            - ray_num_gpus: number of GPUs to tell Ray about
+            - cuda_visible_devices: string for CUDA_VISIBLE_DEVICES
+    """
+    train_devices = cfg.train_model.num_devices
+    inf_devices = cfg.inference_model.num_devices
+
+    train_start = 0
+    train_end = train_devices
+    inference_start = 0
+    inference_end = inf_devices
+
+    ref_devices = cfg.ref_model.num_devices if cfg.train.use_kl_to_ref else 0
+    ray_num_gpus = train_devices + inf_devices + ref_devices
+
+    train_model_devices = list(range(train_start, train_end))
+    inference_model_devices = list(range(inference_start, inference_end))
+
+    all_devices = sorted(set(train_model_devices + inference_model_devices))
+    if cfg.train.use_kl_to_ref:
+        ref_device_start = max(all_devices) + 1 if all_devices else 0
+        ref_devices_list = list(range(ref_device_start, ref_device_start + ref_devices))
+        all_devices.extend(ref_devices_list)
+    cuda_visible_devices = ",".join(map(str, all_devices))
+
+    return {
+        "train_model_devices": train_model_devices,
+        "inference_model_devices": inference_model_devices,
+        "ray_num_gpus": ray_num_gpus,
+        "cuda_visible_devices": cuda_visible_devices,
+    }
+
+
+def make_env(cfg: DictConfig, single_env: bool = False):
+    """Create the environment.
+
+    Args:
+        cfg: The configuration object
+
+    Returns:
+        The configured environment
+    """
+    train_tokenizer = get_tokenizer(cfg)
+
+    # Setup environment
+    max_steps = cfg.env.max_steps if cfg.env.reasoning else 1
+    if cfg.env.dataset == "gsm8k":
+        # Reward scale is 0.0 to 100
+        reward_threshold = 20
+        env = GSM8KEnv(
+            repeats=cfg.env.repeats,
+            tokenizer=train_tokenizer,
+            num_envs=cfg.env.num_envs if not single_env else 1,
+            max_steps=max_steps,
+            device=torch.device("cpu"),
+            ray_backend=True,
+        )
+    elif cfg.env.dataset == "ifeval":  # ifeval
+        # Reward scale is 0.0 to 2.2
+        reward_threshold = 1.0
+        env = IFEvalEnv(
+            repeats=cfg.env.repeats,
+            tokenizer=train_tokenizer,
+            num_envs=cfg.env.num_envs if not single_env else 1,
+            max_steps=max_steps,
+            device=torch.device("cpu"),
+            ray_backend=True,
+        )
+    else:
+        raise NotImplementedError(f"Dataset {cfg.env.dataset} not implemented")
+
+    if cfg.env.reasoning:
+        env = env.append_transform(
+            AddThinkingPrompt(
+                cond=lambda td, reward_threshold=reward_threshold, max_steps=max_steps: td[
+                    "reward"
+                ]
+                <= reward_threshold
+                and td["step_count"] < max_steps,
+                role="user",
+                edit_last_turn=False,
+                zero_reward=False,
+                undo_done=True,
+                random_prompt=True,
+            ),
+        )
+    return env
+
+
+def make_ref_model_factory(cfg: DictConfig) -> functools.partial | None:
+    """Create a factory for the reference model if KL to ref is enabled.
+
+    Args:
+        cfg: The configuration object
+
+    Returns:
+        A partial function that creates the reference model, or None if KL to ref is disabled
+    """
+    if not cfg.train.use_kl_to_ref:
+        return None
+
+    train_tokenizer = get_tokenizer(cfg)
+    ref_cfg = DictConfig(dict(cfg))
+    ref_model_factory = functools.partial(
+        get_ref_model,
+        ref_cfg,
+        train_tokenizer,
+        devices=[0],
+    )
+    return ref_model_factory
+
+
+def add_kl_transforms_to_replay_buffer(replay_buffer, cfg: DictConfig):
+    """Add KL transforms to replay buffer.
+
+    Args:
+        replay_buffer: The replay buffer to add transforms to
+        cfg: The configuration object
+    """
+    if not cfg.train.use_kl_to_ref:
+        return
+
+    ref_model_factory = make_ref_model_factory(cfg)
+    if ref_model_factory is None:
+        return
+
+    if cfg.env.reasoning:
+        kl_transform = RetrieveKL(
+            ref_model_factory=ref_model_factory,
+            add_to_reward=not cfg.train.kl_coef_in_loss,
+            coeff=cfg.train.kl_to_ref_coeff,
+            use_ray_service=True,
+        )
+    else:
+        kl_transform = KLRewardTransform(
+            ref_model_factory=ref_model_factory,
+            coef=cfg.train.kl_to_ref_coeff,
+            add_to_reward=not cfg.train.kl_coef_in_loss,
+            device=torch.device("cuda:0"),
+            use_ray_service=True,
+        )
+
+    replay_buffer.append_transform(kl_transform, invert=True)
+
+
+@timeit("Logging metrics")
+def log_training_metrics(
+    wandb_logger,
+    replay_buffer,
+    batch,
+    loss,
+    grad_norm,
+    global_step,
+    data_read_count,
+    collector,
+    start_time,
+    gradient_accumulation_steps,
+    history_str=None,
+    use_kl_to_ref=True,
+):
+    """Log training metrics to wandb.
+
+    Args:
+        wandb_logger: The wandb logger instance
+        replay_buffer: The replay buffer containing collected data
+        batch: The current training batch
+        loss: The computed loss object
+        grad_norm: The gradient norm value
+        global_step: Current global training step
+        data_read_count: Total data read count
+        collector: The collector instance
+        start_time: Training start time
+        gradient_accumulation_steps: Number of gradient accumulation steps
+        history_str: Optional history string for logging
+    """
+    with torch.no_grad():
+        rb_content = replay_buffer[:]
+        step_count = rb_content.get(("next", "step_count")).view(-1).float().mean()
+        batch_policy_version = batch["next", "policy_version"].view(-1).min()
+        batch_policy_age = collector.policy_version - batch_policy_version
+
+        metrics = {
+            "step_count from buffer": float(step_count),
+            "reward from buffer": float(
+                torch.cat(rb_content.get(("next", "reward"), as_list=True)).mean()
+            ),
+            "seq_length from buffer": float(
+                torch.tensor(
+                    [
+                        t.numel()
+                        for t in rb_content.get(("tokens", "response"), as_list=True)
+                    ],
+                    dtype=torch.float,
+                ).mean()
+            ),
+            "ESS, from loss": float(loss.ESS),
+            "loss_objective, from loss": float(loss.loss_objective),
+            "clip_fraction, from loss": float(loss.clip_fraction),
+            "kl_approx (train to inference), from loss": float(loss.kl_approx),
+            "kl_to_inference (train to inference - differentiable), from loss": float(
+                loss.kl_to_inference.mean()
+            ),
+            "loss_kl_to_inference, from loss": float(loss.loss_kl_to_inference.mean()),
+            "entropy loss, from loss": float(loss.loss_entropy.mean()),
+            "grad_norm": float(grad_norm)
+            if global_step % gradient_accumulation_steps == 0
+            else 0.0,
+            "write_count, from buffer": int(replay_buffer.write_count),
+            # how many gradient steps per write
+            "gradient_step_throughput (gradient step per write)": float(
+                global_step / replay_buffer.write_count
+            ),
+            # how many optim steps per write
+            "optim_step_throughput (optim step per write)": float(
+                (global_step // gradient_accumulation_steps) / replay_buffer.write_count
+            ),
+            "data_read_count (total)": data_read_count,
+            "current_policy_version (collector)": collector.policy_version,
+            # FIXME: Assume batch is a single trajectory
+            # FIXME: The addition of the transform after the env instantiation + _shuttle creation
+            # is messed up - we need the next data
+            "batch_policy_version (sampled batch)": batch_policy_version,
+            "batch_policy_age (sampled batch)": batch_policy_age,
+            "throughput (steps per second)": float(
+                global_step / (time.time() - start_time)
+            ),
+        }
+        if use_kl_to_ref:
+            metrics["kl_penalty (inference to ref) from buffer"] = float(
+                torch.cat(rb_content.get(("next", "kl_penalty"), as_list=True)).mean()
+            )
+            metrics["kl_to_ref, from loss"] = float(loss.kl_to_ref.mean())
+            metrics["loss_kl_to_ref, from loss"] = float(loss.loss_kl_to_ref.mean())
+
+        for name, value in metrics.items():
+            wandb_logger.log_scalar(name, value, step=global_step)
+
+        if history_str is not None:
+            wandb_logger.log_str("history", history_str, step=global_step)
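
For orientation, the sketch below shows one plausible way the helpers defined in this file fit together in a GRPO training run. It is illustrative only: the actual wiring lives in sota-implementations/grpo/grpo-sync.py and grpo-async.py from the listing above, and `cfg` is assumed to be a Hydra/OmegaConf config exposing the fields these functions read (model.name, env.*, train_model.*, inference_model.*, ref_model.*, train.*).

# Illustrative sketch only - not part of the packaged file shown above.
# Assumes this code sits next to grpo_utils.py and that `cfg` carries the fields read by it.
from grpo_utils import (
    add_kl_transforms_to_replay_buffer,
    check_grpo_dependencies,
    compute_device_allocation,
    get_inference_model,
    get_train_model,
    make_env,
    make_weight_sync_scheme,
)


def setup_grpo(cfg, replay_buffer):
    check_grpo_dependencies()  # fail fast with install hints for missing packages
    device_cfg = compute_device_allocation(cfg)  # GPU split + CUDA_VISIBLE_DEVICES string

    env = make_env(cfg)  # GSM8KEnv or IFEvalEnv, with AddThinkingPrompt when reasoning is on
    policy = get_inference_model(cfg, devices=device_cfg["inference_model_devices"])
    policy_training, tokenizer = get_train_model(
        cfg, devices=device_cfg["train_model_devices"]
    )

    # Optional KL-to-reference shaping, applied as a replay-buffer transform
    add_kl_transforms_to_replay_buffer(replay_buffer, cfg)

    # NCCL weight sync from the training model to the vLLM inference replicas
    weight_sync_scheme = make_weight_sync_scheme(policy.model)
    return env, policy, policy_training, tokenizer, weight_sync_scheme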