torchrl-0.11.0-cp314-cp314t-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/benchmark_batched_envs.py +104 -0
- benchmarks/conftest.py +91 -0
- benchmarks/ecosystem/gym_env_throughput.py +321 -0
- benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
- benchmarks/requirements.txt +7 -0
- benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
- benchmarks/test_collectors_benchmark.py +240 -0
- benchmarks/test_compressed_storage_benchmark.py +145 -0
- benchmarks/test_envs_benchmark.py +133 -0
- benchmarks/test_llm.py +101 -0
- benchmarks/test_non_tensor_env_benchmark.py +70 -0
- benchmarks/test_objectives_benchmarks.py +1199 -0
- benchmarks/test_replaybuffer_benchmark.py +254 -0
- sota-check/README.md +35 -0
- sota-implementations/README.md +142 -0
- sota-implementations/a2c/README.md +39 -0
- sota-implementations/a2c/a2c_atari.py +291 -0
- sota-implementations/a2c/a2c_mujoco.py +273 -0
- sota-implementations/a2c/utils_atari.py +240 -0
- sota-implementations/a2c/utils_mujoco.py +160 -0
- sota-implementations/bandits/README.md +7 -0
- sota-implementations/bandits/dqn.py +126 -0
- sota-implementations/cql/cql_offline.py +198 -0
- sota-implementations/cql/cql_online.py +249 -0
- sota-implementations/cql/discrete_cql_offline.py +180 -0
- sota-implementations/cql/discrete_cql_online.py +227 -0
- sota-implementations/cql/utils.py +471 -0
- sota-implementations/crossq/crossq.py +271 -0
- sota-implementations/crossq/utils.py +320 -0
- sota-implementations/ddpg/ddpg.py +231 -0
- sota-implementations/ddpg/utils.py +325 -0
- sota-implementations/decision_transformer/dt.py +163 -0
- sota-implementations/decision_transformer/lamb.py +167 -0
- sota-implementations/decision_transformer/online_dt.py +178 -0
- sota-implementations/decision_transformer/utils.py +562 -0
- sota-implementations/discrete_sac/discrete_sac.py +243 -0
- sota-implementations/discrete_sac/utils.py +324 -0
- sota-implementations/dqn/README.md +30 -0
- sota-implementations/dqn/dqn_atari.py +272 -0
- sota-implementations/dqn/dqn_cartpole.py +236 -0
- sota-implementations/dqn/utils_atari.py +132 -0
- sota-implementations/dqn/utils_cartpole.py +90 -0
- sota-implementations/dreamer/README.md +129 -0
- sota-implementations/dreamer/dreamer.py +586 -0
- sota-implementations/dreamer/dreamer_utils.py +1107 -0
- sota-implementations/expert-iteration/README.md +352 -0
- sota-implementations/expert-iteration/ei_utils.py +770 -0
- sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
- sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
- sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
- sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
- sota-implementations/gail/gail.py +327 -0
- sota-implementations/gail/gail_utils.py +68 -0
- sota-implementations/gail/ppo_utils.py +157 -0
- sota-implementations/grpo/README.md +273 -0
- sota-implementations/grpo/grpo-async.py +437 -0
- sota-implementations/grpo/grpo-sync.py +435 -0
- sota-implementations/grpo/grpo_utils.py +843 -0
- sota-implementations/grpo/requirements_gsm8k.txt +11 -0
- sota-implementations/grpo/requirements_ifeval.txt +16 -0
- sota-implementations/impala/README.md +33 -0
- sota-implementations/impala/impala_multi_node_ray.py +292 -0
- sota-implementations/impala/impala_multi_node_submitit.py +284 -0
- sota-implementations/impala/impala_single_node.py +261 -0
- sota-implementations/impala/utils.py +184 -0
- sota-implementations/iql/discrete_iql.py +230 -0
- sota-implementations/iql/iql_offline.py +164 -0
- sota-implementations/iql/iql_online.py +225 -0
- sota-implementations/iql/utils.py +437 -0
- sota-implementations/multiagent/README.md +74 -0
- sota-implementations/multiagent/iql.py +237 -0
- sota-implementations/multiagent/maddpg_iddpg.py +266 -0
- sota-implementations/multiagent/mappo_ippo.py +267 -0
- sota-implementations/multiagent/qmix_vdn.py +271 -0
- sota-implementations/multiagent/sac.py +337 -0
- sota-implementations/multiagent/utils/__init__.py +4 -0
- sota-implementations/multiagent/utils/logging.py +151 -0
- sota-implementations/multiagent/utils/utils.py +43 -0
- sota-implementations/ppo/README.md +29 -0
- sota-implementations/ppo/ppo_atari.py +305 -0
- sota-implementations/ppo/ppo_mujoco.py +293 -0
- sota-implementations/ppo/utils_atari.py +238 -0
- sota-implementations/ppo/utils_mujoco.py +152 -0
- sota-implementations/ppo_trainer/train.py +21 -0
- sota-implementations/redq/README.md +7 -0
- sota-implementations/redq/redq.py +199 -0
- sota-implementations/redq/utils.py +1060 -0
- sota-implementations/sac/sac-async.py +266 -0
- sota-implementations/sac/sac.py +239 -0
- sota-implementations/sac/utils.py +381 -0
- sota-implementations/sac_trainer/train.py +16 -0
- sota-implementations/td3/td3.py +254 -0
- sota-implementations/td3/utils.py +319 -0
- sota-implementations/td3_bc/td3_bc.py +177 -0
- sota-implementations/td3_bc/utils.py +251 -0
- torchrl/.dylibs/libc++.1.0.dylib +0 -0
- torchrl/__init__.py +144 -0
- torchrl/_extension.py +74 -0
- torchrl/_torchrl.cpython-314t-darwin.so +0 -0
- torchrl/_utils.py +1431 -0
- torchrl/collectors/__init__.py +48 -0
- torchrl/collectors/_base.py +1058 -0
- torchrl/collectors/_constants.py +88 -0
- torchrl/collectors/_multi_async.py +324 -0
- torchrl/collectors/_multi_base.py +1805 -0
- torchrl/collectors/_multi_sync.py +464 -0
- torchrl/collectors/_runner.py +581 -0
- torchrl/collectors/_single.py +2009 -0
- torchrl/collectors/_single_async.py +259 -0
- torchrl/collectors/collectors.py +62 -0
- torchrl/collectors/distributed/__init__.py +32 -0
- torchrl/collectors/distributed/default_configs.py +133 -0
- torchrl/collectors/distributed/generic.py +1306 -0
- torchrl/collectors/distributed/ray.py +1092 -0
- torchrl/collectors/distributed/rpc.py +1006 -0
- torchrl/collectors/distributed/sync.py +731 -0
- torchrl/collectors/distributed/utils.py +160 -0
- torchrl/collectors/llm/__init__.py +10 -0
- torchrl/collectors/llm/base.py +494 -0
- torchrl/collectors/llm/ray_collector.py +275 -0
- torchrl/collectors/llm/utils.py +36 -0
- torchrl/collectors/llm/weight_update/__init__.py +10 -0
- torchrl/collectors/llm/weight_update/vllm.py +348 -0
- torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
- torchrl/collectors/utils.py +433 -0
- torchrl/collectors/weight_update.py +591 -0
- torchrl/csrc/numpy_utils.h +38 -0
- torchrl/csrc/pybind.cpp +27 -0
- torchrl/csrc/segment_tree.h +458 -0
- torchrl/csrc/torch_utils.h +34 -0
- torchrl/csrc/utils.cpp +48 -0
- torchrl/csrc/utils.h +31 -0
- torchrl/data/__init__.py +187 -0
- torchrl/data/datasets/__init__.py +58 -0
- torchrl/data/datasets/atari_dqn.py +878 -0
- torchrl/data/datasets/common.py +281 -0
- torchrl/data/datasets/d4rl.py +489 -0
- torchrl/data/datasets/d4rl_infos.py +187 -0
- torchrl/data/datasets/gen_dgrl.py +375 -0
- torchrl/data/datasets/minari_data.py +643 -0
- torchrl/data/datasets/openml.py +177 -0
- torchrl/data/datasets/openx.py +798 -0
- torchrl/data/datasets/roboset.py +363 -0
- torchrl/data/datasets/utils.py +11 -0
- torchrl/data/datasets/vd4rl.py +432 -0
- torchrl/data/llm/__init__.py +34 -0
- torchrl/data/llm/dataset.py +491 -0
- torchrl/data/llm/history.py +1378 -0
- torchrl/data/llm/prompt.py +198 -0
- torchrl/data/llm/reward.py +225 -0
- torchrl/data/llm/topk.py +186 -0
- torchrl/data/llm/utils.py +543 -0
- torchrl/data/map/__init__.py +21 -0
- torchrl/data/map/hash.py +185 -0
- torchrl/data/map/query.py +204 -0
- torchrl/data/map/tdstorage.py +363 -0
- torchrl/data/map/tree.py +1434 -0
- torchrl/data/map/utils.py +103 -0
- torchrl/data/postprocs/__init__.py +8 -0
- torchrl/data/postprocs/postprocs.py +391 -0
- torchrl/data/replay_buffers/__init__.py +99 -0
- torchrl/data/replay_buffers/checkpointers.py +622 -0
- torchrl/data/replay_buffers/ray_buffer.py +292 -0
- torchrl/data/replay_buffers/replay_buffers.py +2376 -0
- torchrl/data/replay_buffers/samplers.py +2578 -0
- torchrl/data/replay_buffers/scheduler.py +265 -0
- torchrl/data/replay_buffers/storages.py +2412 -0
- torchrl/data/replay_buffers/utils.py +1042 -0
- torchrl/data/replay_buffers/writers.py +781 -0
- torchrl/data/tensor_specs.py +7101 -0
- torchrl/data/utils.py +334 -0
- torchrl/envs/__init__.py +265 -0
- torchrl/envs/async_envs.py +1105 -0
- torchrl/envs/batched_envs.py +3093 -0
- torchrl/envs/common.py +4241 -0
- torchrl/envs/custom/__init__.py +11 -0
- torchrl/envs/custom/chess.py +617 -0
- torchrl/envs/custom/llm.py +214 -0
- torchrl/envs/custom/pendulum.py +401 -0
- torchrl/envs/custom/san_moves.txt +29274 -0
- torchrl/envs/custom/tictactoeenv.py +288 -0
- torchrl/envs/env_creator.py +263 -0
- torchrl/envs/gym_like.py +752 -0
- torchrl/envs/libs/__init__.py +68 -0
- torchrl/envs/libs/_gym_utils.py +326 -0
- torchrl/envs/libs/brax.py +846 -0
- torchrl/envs/libs/dm_control.py +544 -0
- torchrl/envs/libs/envpool.py +447 -0
- torchrl/envs/libs/gym.py +2239 -0
- torchrl/envs/libs/habitat.py +138 -0
- torchrl/envs/libs/isaac_lab.py +87 -0
- torchrl/envs/libs/isaacgym.py +203 -0
- torchrl/envs/libs/jax_utils.py +166 -0
- torchrl/envs/libs/jumanji.py +963 -0
- torchrl/envs/libs/meltingpot.py +599 -0
- torchrl/envs/libs/openml.py +153 -0
- torchrl/envs/libs/openspiel.py +652 -0
- torchrl/envs/libs/pettingzoo.py +1042 -0
- torchrl/envs/libs/procgen.py +351 -0
- torchrl/envs/libs/robohive.py +429 -0
- torchrl/envs/libs/smacv2.py +645 -0
- torchrl/envs/libs/unity_mlagents.py +891 -0
- torchrl/envs/libs/utils.py +147 -0
- torchrl/envs/libs/vmas.py +813 -0
- torchrl/envs/llm/__init__.py +63 -0
- torchrl/envs/llm/chat.py +730 -0
- torchrl/envs/llm/datasets/README.md +4 -0
- torchrl/envs/llm/datasets/__init__.py +17 -0
- torchrl/envs/llm/datasets/gsm8k.py +353 -0
- torchrl/envs/llm/datasets/ifeval.py +274 -0
- torchrl/envs/llm/envs.py +789 -0
- torchrl/envs/llm/libs/README.md +3 -0
- torchrl/envs/llm/libs/__init__.py +8 -0
- torchrl/envs/llm/libs/mlgym.py +869 -0
- torchrl/envs/llm/reward/__init__.py +10 -0
- torchrl/envs/llm/reward/gsm8k.py +324 -0
- torchrl/envs/llm/reward/ifeval/README.md +13 -0
- torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
- torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
- torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
- torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
- torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
- torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
- torchrl/envs/llm/transforms/__init__.py +55 -0
- torchrl/envs/llm/transforms/browser.py +292 -0
- torchrl/envs/llm/transforms/dataloading.py +859 -0
- torchrl/envs/llm/transforms/format.py +73 -0
- torchrl/envs/llm/transforms/kl.py +1544 -0
- torchrl/envs/llm/transforms/policy_version.py +189 -0
- torchrl/envs/llm/transforms/reason.py +323 -0
- torchrl/envs/llm/transforms/tokenizer.py +321 -0
- torchrl/envs/llm/transforms/tools.py +1955 -0
- torchrl/envs/model_based/__init__.py +9 -0
- torchrl/envs/model_based/common.py +180 -0
- torchrl/envs/model_based/dreamer.py +112 -0
- torchrl/envs/transforms/__init__.py +147 -0
- torchrl/envs/transforms/functional.py +48 -0
- torchrl/envs/transforms/gym_transforms.py +203 -0
- torchrl/envs/transforms/module.py +341 -0
- torchrl/envs/transforms/r3m.py +372 -0
- torchrl/envs/transforms/ray_service.py +663 -0
- torchrl/envs/transforms/rb_transforms.py +214 -0
- torchrl/envs/transforms/transforms.py +11835 -0
- torchrl/envs/transforms/utils.py +94 -0
- torchrl/envs/transforms/vc1.py +307 -0
- torchrl/envs/transforms/vecnorm.py +845 -0
- torchrl/envs/transforms/vip.py +407 -0
- torchrl/envs/utils.py +1718 -0
- torchrl/envs/vec_envs.py +11 -0
- torchrl/modules/__init__.py +206 -0
- torchrl/modules/distributions/__init__.py +73 -0
- torchrl/modules/distributions/continuous.py +830 -0
- torchrl/modules/distributions/discrete.py +908 -0
- torchrl/modules/distributions/truncated_normal.py +187 -0
- torchrl/modules/distributions/utils.py +233 -0
- torchrl/modules/llm/__init__.py +62 -0
- torchrl/modules/llm/backends/__init__.py +65 -0
- torchrl/modules/llm/backends/vllm/__init__.py +94 -0
- torchrl/modules/llm/backends/vllm/_models.py +46 -0
- torchrl/modules/llm/backends/vllm/base.py +72 -0
- torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
- torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
- torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
- torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
- torchrl/modules/llm/policies/__init__.py +28 -0
- torchrl/modules/llm/policies/common.py +1809 -0
- torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
- torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
- torchrl/modules/llm/utils.py +23 -0
- torchrl/modules/mcts/__init__.py +21 -0
- torchrl/modules/mcts/scores.py +579 -0
- torchrl/modules/models/__init__.py +86 -0
- torchrl/modules/models/batchrenorm.py +119 -0
- torchrl/modules/models/decision_transformer.py +179 -0
- torchrl/modules/models/exploration.py +731 -0
- torchrl/modules/models/llm.py +156 -0
- torchrl/modules/models/model_based.py +596 -0
- torchrl/modules/models/models.py +1712 -0
- torchrl/modules/models/multiagent.py +1067 -0
- torchrl/modules/models/recipes/impala.py +185 -0
- torchrl/modules/models/utils.py +162 -0
- torchrl/modules/planners/__init__.py +10 -0
- torchrl/modules/planners/cem.py +228 -0
- torchrl/modules/planners/common.py +73 -0
- torchrl/modules/planners/mppi.py +265 -0
- torchrl/modules/tensordict_module/__init__.py +89 -0
- torchrl/modules/tensordict_module/actors.py +2457 -0
- torchrl/modules/tensordict_module/common.py +529 -0
- torchrl/modules/tensordict_module/exploration.py +814 -0
- torchrl/modules/tensordict_module/probabilistic.py +321 -0
- torchrl/modules/tensordict_module/rnn.py +1639 -0
- torchrl/modules/tensordict_module/sequence.py +132 -0
- torchrl/modules/tensordict_module/world_models.py +34 -0
- torchrl/modules/utils/__init__.py +38 -0
- torchrl/modules/utils/mappings.py +9 -0
- torchrl/modules/utils/utils.py +89 -0
- torchrl/objectives/__init__.py +78 -0
- torchrl/objectives/a2c.py +659 -0
- torchrl/objectives/common.py +753 -0
- torchrl/objectives/cql.py +1346 -0
- torchrl/objectives/crossq.py +710 -0
- torchrl/objectives/ddpg.py +453 -0
- torchrl/objectives/decision_transformer.py +371 -0
- torchrl/objectives/deprecated.py +516 -0
- torchrl/objectives/dqn.py +683 -0
- torchrl/objectives/dreamer.py +488 -0
- torchrl/objectives/functional.py +48 -0
- torchrl/objectives/gail.py +258 -0
- torchrl/objectives/iql.py +996 -0
- torchrl/objectives/llm/__init__.py +30 -0
- torchrl/objectives/llm/grpo.py +846 -0
- torchrl/objectives/llm/sft.py +482 -0
- torchrl/objectives/multiagent/__init__.py +8 -0
- torchrl/objectives/multiagent/qmixer.py +396 -0
- torchrl/objectives/ppo.py +1669 -0
- torchrl/objectives/redq.py +683 -0
- torchrl/objectives/reinforce.py +530 -0
- torchrl/objectives/sac.py +1580 -0
- torchrl/objectives/td3.py +570 -0
- torchrl/objectives/td3_bc.py +625 -0
- torchrl/objectives/utils.py +782 -0
- torchrl/objectives/value/__init__.py +28 -0
- torchrl/objectives/value/advantages.py +1956 -0
- torchrl/objectives/value/functional.py +1459 -0
- torchrl/objectives/value/utils.py +360 -0
- torchrl/record/__init__.py +17 -0
- torchrl/record/loggers/__init__.py +23 -0
- torchrl/record/loggers/common.py +48 -0
- torchrl/record/loggers/csv.py +226 -0
- torchrl/record/loggers/mlflow.py +142 -0
- torchrl/record/loggers/tensorboard.py +139 -0
- torchrl/record/loggers/trackio.py +163 -0
- torchrl/record/loggers/utils.py +78 -0
- torchrl/record/loggers/wandb.py +214 -0
- torchrl/record/recorder.py +554 -0
- torchrl/services/__init__.py +79 -0
- torchrl/services/base.py +109 -0
- torchrl/services/ray_service.py +453 -0
- torchrl/testing/__init__.py +107 -0
- torchrl/testing/assertions.py +179 -0
- torchrl/testing/dist_utils.py +122 -0
- torchrl/testing/env_creators.py +227 -0
- torchrl/testing/env_helper.py +35 -0
- torchrl/testing/gym_helpers.py +156 -0
- torchrl/testing/llm_mocks.py +119 -0
- torchrl/testing/mocking_classes.py +2720 -0
- torchrl/testing/modules.py +295 -0
- torchrl/testing/mp_helpers.py +15 -0
- torchrl/testing/ray_helpers.py +293 -0
- torchrl/testing/utils.py +190 -0
- torchrl/trainers/__init__.py +42 -0
- torchrl/trainers/algorithms/__init__.py +11 -0
- torchrl/trainers/algorithms/configs/__init__.py +705 -0
- torchrl/trainers/algorithms/configs/collectors.py +216 -0
- torchrl/trainers/algorithms/configs/common.py +41 -0
- torchrl/trainers/algorithms/configs/data.py +308 -0
- torchrl/trainers/algorithms/configs/envs.py +104 -0
- torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
- torchrl/trainers/algorithms/configs/logging.py +80 -0
- torchrl/trainers/algorithms/configs/modules.py +570 -0
- torchrl/trainers/algorithms/configs/objectives.py +177 -0
- torchrl/trainers/algorithms/configs/trainers.py +340 -0
- torchrl/trainers/algorithms/configs/transforms.py +955 -0
- torchrl/trainers/algorithms/configs/utils.py +252 -0
- torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
- torchrl/trainers/algorithms/configs/weight_update.py +159 -0
- torchrl/trainers/algorithms/ppo.py +373 -0
- torchrl/trainers/algorithms/sac.py +308 -0
- torchrl/trainers/helpers/__init__.py +40 -0
- torchrl/trainers/helpers/collectors.py +416 -0
- torchrl/trainers/helpers/envs.py +573 -0
- torchrl/trainers/helpers/logger.py +33 -0
- torchrl/trainers/helpers/losses.py +132 -0
- torchrl/trainers/helpers/models.py +658 -0
- torchrl/trainers/helpers/replay_buffer.py +59 -0
- torchrl/trainers/helpers/trainers.py +301 -0
- torchrl/trainers/trainers.py +2052 -0
- torchrl/weight_update/__init__.py +33 -0
- torchrl/weight_update/_distributed.py +749 -0
- torchrl/weight_update/_mp.py +624 -0
- torchrl/weight_update/_noupdate.py +102 -0
- torchrl/weight_update/_ray.py +1032 -0
- torchrl/weight_update/_rpc.py +284 -0
- torchrl/weight_update/_shared.py +891 -0
- torchrl/weight_update/llm/__init__.py +32 -0
- torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
- torchrl/weight_update/llm/vllm_nccl.py +710 -0
- torchrl/weight_update/utils.py +73 -0
- torchrl/weight_update/weight_sync_schemes.py +1244 -0
- torchrl-0.11.0.dist-info/METADATA +1308 -0
- torchrl-0.11.0.dist-info/RECORD +395 -0
- torchrl-0.11.0.dist-info/WHEEL +5 -0
- torchrl-0.11.0.dist-info/entry_points.txt +2 -0
- torchrl-0.11.0.dist-info/licenses/LICENSE +21 -0
- torchrl-0.11.0.dist-info/top_level.txt +7 -0
sota-implementations/grpo/grpo-async.py
@@ -0,0 +1,437 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import annotations

import gc
import os
import time
from functools import partial
from pathlib import Path

import hydra

from torchrl import merge_ray_runtime_env, torchrl_logger
from torchrl.data.llm.history import History
from torchrl.record.loggers.wandb import WandbLogger
from torchrl.weight_update.llm import get_model_metadata

try:
    import ray
except ImportError:
    raise ImportError(
        "Ray is required for async training. Please install ray with `pip install ray`."
    )
import torch
import tqdm

from grpo_utils import (
    add_kl_transforms_to_replay_buffer,
    check_grpo_dependencies,
    compute_device_allocation,
    get_inference_model,
    get_train_model,
    log_training_metrics,
    make_env,
    make_weight_sync_scheme,
)
from omegaconf import DictConfig

try:
    from tensordict import set_list_to_stack
except ImportError:
    raise ImportError(
        "TensorDict is required. Please install it with `pip install tensordict`."
    )
from torch.amp.autocast_mode import autocast
from torch.amp.grad_scaler import GradScaler
from torchrl._utils import timeit
from torchrl.collectors.llm import RayLLMCollector
from torchrl.data import LazyStackStorage, ReplayBuffer
from torchrl.data.replay_buffers.ray_buffer import RayReplayBuffer
from torchrl.objectives.llm.grpo import GRPOLoss, MCAdvantage


def setup_environment() -> None:
    """Setup required environment variables and configurations."""
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required for training")

    # Set default dtype to float32 for mixed precision training
    torch.set_default_dtype(torch.float32)
    torch.set_default_device("cuda:0")
    set_list_to_stack(True).set()

    # Ensure CUDA is using the correct dtype
    if torch.cuda.is_available():
        torch.cuda.set_device("cuda:0")


def train(
    replay_buffer: ReplayBuffer,
    cfg: DictConfig,
    collectors: list[RayLLMCollector],
    inference_policy,
    devices: list[int] | None = None,
):
    """Main training loop for GRPO async.

    This function implements asynchronous training where data collection and optimization
    happen concurrently. The total number of steps is determined by the number of epochs,
    samples per epoch, and batches collected.

    Args:
        replay_buffer: The replay buffer to store experiences
        cfg: The configuration object containing training parameters
        collectors: The collectors objects.
        devices: The devices to use for the training model.
    """
    # Setup training model and tokenizer
    policy_training, train_tokenizer = get_train_model(cfg, devices=devices)
    train_device = torch.device(f"cuda:{devices[0]}" if devices else "cuda:0")

    # Setup loss function
    loss_fn = GRPOLoss(
        actor_network=policy_training,
        kl_to_ref_coeff=cfg.train.kl_to_ref_coeff
        if (cfg.train.kl_coef_in_loss and cfg.train.use_kl_to_ref)
        else 0.0,
        kl_to_inference_coeff=cfg.train.kl_to_inference_coeff,
        entropy_coeff=cfg.train.entropy_coeff,
        masking_strategy="rlhf" if cfg.env.reasoning else "sft",
        device=train_device,
    )
    if cfg.env.reasoning:
        # TODO: this is clunky, we should find a way to do this more naturally
        loss_fn.set_keys(sample_log_prob=("next", "log_probs", "full"))
    if cfg.model.compile:
        loss_fn = torch.compile(loss_fn)

    vllm_engine = inference_policy.model

    # Create weight sync scheme for the collectors
    weight_sync_scheme = make_weight_sync_scheme(vllm_engine=vllm_engine)

    # Set up weight sync scheme for collectors
    # Note: We need to get the sender after the collectors are created
    # For now, we'll update the collectors to use the scheme
    torchrl_logger.info("Setting up weight synchronization scheme...")

    # We'll need to manually set up the sender since collectors were already created
    # without the scheme. In production, collectors should be created with weight_sync_schemes parameter.
    sender = weight_sync_scheme.create_sender()
    sender.register_model(policy_training)

    # Initialize collective group
    torchrl_logger.info("Initializing collective group...")
    metadata = get_model_metadata(policy_training)
    sender.init_all_workers_group(metadata, vllm_engine=vllm_engine)

    # First weight update
    with timeit("update_policy_weights"):
        sender.update_weights()
    torchrl_logger.info("Completed first update_policy_weights. Starting collectors...")
    timeit.print(prefix="First update_policy_weights_ time")
    timeit.reset()

    for i, collector in enumerate(collectors):
        torchrl_logger.info(f"Starting collector {i}...")
        collector.start()

    while not replay_buffer.write_count:
        torchrl_logger.info("Waiting for replay buffer...")
        time.sleep(1)

    # Make optimizer
    optimizer = torch.optim.Adam(
        policy_training.parameters(),
        lr=cfg.optimizer.lr,
        weight_decay=cfg.optimizer.weight_decay,
        fused=False,
    )
    scaler = GradScaler(enabled=cfg.train.mixed_precision)

    # Make checkpoint dir
    checkpoint_dir = Path(cfg.logging.checkpoint_dir)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Make wandb logger
    experiment_name = cfg.logging.experiment_name
    if experiment_name is not None:
        experiment_name = [experiment_name]
    else:
        experiment_name = []

    experiment_name.append(cfg.env.dataset)
    experiment_name.append(cfg.model.name)
    wandb_logger = WandbLogger(
        project="grpo-async", exp_name="-".join(["grpo-async"] + experiment_name)
    )

    # Training loop
    total_steps = (
        -(cfg.train.total_dialog_turns // -cfg.train.optim_batch_size)
        * cfg.train.epochs
    )
    torchrl_logger.info(f"Total steps: {total_steps}")

    pbar = tqdm.tqdm(total=total_steps)
    grad_norm = 0.0  # Initialize grad_norm
    data_read_count = 0
    start_time = time.time()

    for step in range(total_steps):
        if not any(collector.is_running() for collector in collectors):
            torchrl_logger.info("Collectors stopped, stopping training")
            break
        pbar.update(1)
        pbar.set_description(f"Step {step}, writes: {replay_buffer.write_count}")

        with timeit("sampling"):
            # Sample the correct batch size for gradient accumulation
            # The replay buffer is configured with batch_size = optim_batch_size // gradient_accumulation_steps
            # So we should sample that amount per step, not the full optim_batch_size
            batch_size_per_step = (
                cfg.train.optim_batch_size // cfg.train.gradient_accumulation_steps
            )
            batch = replay_buffer.sample(batch_size_per_step).to(train_device)
            history: History = batch.view(-1)[0]["history", "full"]
            history_str: list[str] | str = history.apply_chat_template(
                tokenizer=train_tokenizer
            )
            while not isinstance(history_str, str):
                history_str = "\n".join(history_str)

        data_read_count += batch.numel()

        with timeit("forward_pass"):
            with autocast("cuda", enabled=cfg.train.mixed_precision):
                loss = loss_fn(batch)
                loss_val = (
                    loss.mean(reduce=True) / cfg.train.gradient_accumulation_steps
                )

        with timeit("backward_pass"):
            if cfg.train.mixed_precision and cfg.train_model.torch_dtype == "float16":
                scaler = GradScaler(enabled=True)
                scaler.scale(loss_val).backward()
            else:
                loss_val.backward()

        if (step + 1) % cfg.train.gradient_accumulation_steps == 0:
            with timeit("optim_step"):
                if (
                    cfg.train.mixed_precision
                    and cfg.train_model.torch_dtype == "float16"
                ):
                    scaler.unscale_(optimizer)

                grad_norm = torch.nn.utils.clip_grad_norm_(
                    policy_training.parameters(),
                    cfg.optimizer.clip_grad_norm,
                )

                if (
                    cfg.train.mixed_precision
                    and cfg.train_model.torch_dtype == "float16"
                ):
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                optimizer.zero_grad(set_to_none=True)

        if (step % cfg.train.logging_frequency) == 0:
            log_training_metrics(
                wandb_logger=wandb_logger,
                replay_buffer=replay_buffer,
                batch=batch,
                loss=loss,
                grad_norm=grad_norm,
                global_step=step,
                data_read_count=data_read_count,
                collector=collectors[0],
                start_time=start_time,
                gradient_accumulation_steps=cfg.train.gradient_accumulation_steps,
                history_str=history_str,
                use_kl_to_ref=cfg.train.use_kl_to_ref,
            )

        if step % cfg.train.weight_update_frequency == 0:
            with timeit("update_policy_weights"):
                torchrl_logger.info("Updating policy weights...")
                sender.update_weights()
            # TODO: do we need this? Does it interfere with other processes?
            # torch.cuda.empty_cache()
            gc.collect()

        # Checkpointing disabled to prevent disk space issues
        # if (step + 1) % cfg.train.checkpoint_frequency == 0:
        #     with timeit("save_checkpoint"):
        #         torchrl_logger.info(
        #             f"Saving checkpoint {(step+1) // cfg.train.checkpoint_frequency}..."
        #         )
        #         checkpoint = {
        #             "step": step,
        #             "model_state_dict": policy_training.model.state_dict(),
        #             "optimizer_state_dict": optimizer.state_dict(),
        #             "scaler_state_dict": scaler.state_dict(),
        #             "config": dict(cfg),
        #         }
        #         torch.save(checkpoint, checkpoint_dir / f"checkpoint_{step:04d}.pt")

        if step % cfg.train.weight_update_frequency == 0:
            timeit.print(prefix="timeit")
            for key, val in timeit.todict().items():
                wandb_logger.log_scalar(f"timeit/{key}", val)
            timeit.reset()

        del loss_val
        # TODO: do we need this? Does it interfere with other processes?
        # torch.cuda.empty_cache()
        gc.collect()

    pbar.close()
    collector.shutdown()


@hydra.main(version_base=None, config_path="config", config_name="grpo_gsm8k")
def main(cfg):
    # Check for required GRPO dependencies
    check_grpo_dependencies()

    # Force async mode
    if cfg.train.sync:
        raise ValueError(
            "grpo-async.py must run in async mode (`python grpo-async.py mode=async`). Please use grpo-sync.py for sync mode (`python grpo-sync.py mode=sync`)."
        )

    # Compute device allocation
    device_config = compute_device_allocation(cfg)

    if not ray.is_initialized():
        # Convert OmegaConf to regular dict and filter out unsupported parameters
        ray_init_config = {
            k: dict(v) if isinstance(v, DictConfig) else v
            for k, v in dict(cfg.ray.init_config).items()
            if not k.startswith("_")
        }

        # Add computed GPU configuration and merge with default runtime_env
        ray_init_config["num_gpus"] = device_config["ray_num_gpus"]
        ray_init_config = merge_ray_runtime_env(ray_init_config)
        torchrl_logger.info(f"Ray init config: {ray_init_config=}")
        ray_managed_externally = os.environ.get("RAY_CLUSTER_MANAGED_EXTERNALLY")
        if ray_managed_externally:
            ray.init(address="auto")
        else:
            ray.init(**ray_init_config)

    # Check if num_devices is set
    if cfg.inference_model.num_devices is None:
        raise ValueError(
            "Inference model num_devices must be set via inference_model.num_devices"
        )
    if cfg.train.use_kl_to_ref and cfg.ref_model.num_devices is None:
        raise ValueError(
            "Ref model num_devices must be set via ref_model.num_devices when use_kl_to_ref is True"
        )
    if cfg.train_model.num_devices is None:
        raise ValueError(
            "Train model num_devices must be set via train_model.num_devices"
        )

    # Convert OmegaConf to regular dict for Ray configs
    replay_buffer_config = dict(cfg.ray.replay_buffer_config)
    collector_config = dict(cfg.ray.collector_config)
    train_handler_config = dict(cfg.ray.train_handler_config)

    inference_policy = get_inference_model(
        cfg,
        devices=device_config["inference_model_devices"],
    )
    torchrl_logger.info(f"Inference policy: {inference_policy}")

    torchrl_logger.info(f"Starting replay buffer with {replay_buffer_config=}")
    if cfg.train.optim_batch_size % cfg.train.gradient_accumulation_steps != 0:
        raise ValueError(
            "optim_batch_size must be divisible by gradient_accumulation_steps"
        )
    rb = RayReplayBuffer(
        storage=partial(
            LazyStackStorage,
            cfg.train.buffer_size
            if cfg.train.buffer_size
            else cfg.env.repeats * cfg.env.num_envs,
        ),
        transform_factory=partial(MCAdvantage, grpo_size=cfg.env.repeats),
        batch_size=max(
            1, cfg.train.optim_batch_size // cfg.train.gradient_accumulation_steps
        ),
        remote_config=replay_buffer_config,
    )

    add_kl_transforms_to_replay_buffer(rb, cfg)

    torchrl_logger.info(f"Replay buffer: {rb}")

    collector_config["num_gpus"] = 0
    collector_config["num_cpus"] = 2
    torchrl_logger.info(f"Starting collector with {collector_config=}")

    if cfg.train.sync_iter is not None:
        raise ValueError("sync_iter is not supported in async mode.")
    collectors = []
    for i in tqdm.trange(cfg.env.num_envs, desc="Starting collectors"):
        collector = RayLLMCollector(
            env=partial(make_env, cfg, single_env=True),
            policy=inference_policy,
            dialog_turns_per_batch=cfg.train.dialog_turns_per_batch,
            total_dialog_turns=cfg.train.total_dialog_turns,
            replay_buffer=rb,
            ray_init_config=None,
            weight_updater=None,
            track_policy_version=True,
            remote_config=collector_config,
            yield_only_last_steps=cfg.env.reasoning,
            verbose=False,
        )
        collectors.append(collector)
        if i == 0:
            # wait for the first collector to initialize
            ray.get(collector._collector.is_initialized.remote())
    inits = []
    for collector in tqdm.tqdm(
        collectors[1:], desc="Checking collector initialization"
    ):
        inits.append(collector._collector.is_initialized.remote())
    ray.get(inits)
    torchrl_logger.info("All collectors initialized")

    train_handler_config = {
        "num_cpus": train_handler_config.get("num_cpus", 1),
        "num_gpus": cfg.train_model.num_devices,
    }
    torchrl_logger.info(f"Starting training handler with {train_handler_config=}")
    train_handler = ray.remote(
        **train_handler_config,
    )(train)

    # launch training
    ray.get(
        train_handler.remote(
            rb,
            cfg,
            collectors,
            inference_policy,
            devices=device_config["train_model_devices"],
        )
    )


if __name__ == "__main__":
    # Setup environment
    setup_environment()
    main()