torchrl 0.11.0__cp314-cp314t-win_amd64.whl
This diff represents the contents of publicly available package versions released to one of the supported registries. The information is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- benchmarks/benchmark_batched_envs.py +104 -0
- benchmarks/conftest.py +91 -0
- benchmarks/ecosystem/gym_env_throughput.py +321 -0
- benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
- benchmarks/requirements.txt +7 -0
- benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
- benchmarks/test_collectors_benchmark.py +240 -0
- benchmarks/test_compressed_storage_benchmark.py +145 -0
- benchmarks/test_envs_benchmark.py +133 -0
- benchmarks/test_llm.py +101 -0
- benchmarks/test_non_tensor_env_benchmark.py +70 -0
- benchmarks/test_objectives_benchmarks.py +1199 -0
- benchmarks/test_replaybuffer_benchmark.py +254 -0
- sota-check/README.md +35 -0
- sota-implementations/README.md +142 -0
- sota-implementations/a2c/README.md +39 -0
- sota-implementations/a2c/a2c_atari.py +291 -0
- sota-implementations/a2c/a2c_mujoco.py +273 -0
- sota-implementations/a2c/utils_atari.py +240 -0
- sota-implementations/a2c/utils_mujoco.py +160 -0
- sota-implementations/bandits/README.md +7 -0
- sota-implementations/bandits/dqn.py +126 -0
- sota-implementations/cql/cql_offline.py +198 -0
- sota-implementations/cql/cql_online.py +249 -0
- sota-implementations/cql/discrete_cql_offline.py +180 -0
- sota-implementations/cql/discrete_cql_online.py +227 -0
- sota-implementations/cql/utils.py +471 -0
- sota-implementations/crossq/crossq.py +271 -0
- sota-implementations/crossq/utils.py +320 -0
- sota-implementations/ddpg/ddpg.py +231 -0
- sota-implementations/ddpg/utils.py +325 -0
- sota-implementations/decision_transformer/dt.py +163 -0
- sota-implementations/decision_transformer/lamb.py +167 -0
- sota-implementations/decision_transformer/online_dt.py +178 -0
- sota-implementations/decision_transformer/utils.py +562 -0
- sota-implementations/discrete_sac/discrete_sac.py +243 -0
- sota-implementations/discrete_sac/utils.py +324 -0
- sota-implementations/dqn/README.md +30 -0
- sota-implementations/dqn/dqn_atari.py +272 -0
- sota-implementations/dqn/dqn_cartpole.py +236 -0
- sota-implementations/dqn/utils_atari.py +132 -0
- sota-implementations/dqn/utils_cartpole.py +90 -0
- sota-implementations/dreamer/README.md +129 -0
- sota-implementations/dreamer/dreamer.py +586 -0
- sota-implementations/dreamer/dreamer_utils.py +1107 -0
- sota-implementations/expert-iteration/README.md +352 -0
- sota-implementations/expert-iteration/ei_utils.py +770 -0
- sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
- sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
- sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
- sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
- sota-implementations/gail/gail.py +327 -0
- sota-implementations/gail/gail_utils.py +68 -0
- sota-implementations/gail/ppo_utils.py +157 -0
- sota-implementations/grpo/README.md +273 -0
- sota-implementations/grpo/grpo-async.py +437 -0
- sota-implementations/grpo/grpo-sync.py +435 -0
- sota-implementations/grpo/grpo_utils.py +843 -0
- sota-implementations/grpo/requirements_gsm8k.txt +11 -0
- sota-implementations/grpo/requirements_ifeval.txt +16 -0
- sota-implementations/impala/README.md +33 -0
- sota-implementations/impala/impala_multi_node_ray.py +292 -0
- sota-implementations/impala/impala_multi_node_submitit.py +284 -0
- sota-implementations/impala/impala_single_node.py +261 -0
- sota-implementations/impala/utils.py +184 -0
- sota-implementations/iql/discrete_iql.py +230 -0
- sota-implementations/iql/iql_offline.py +164 -0
- sota-implementations/iql/iql_online.py +225 -0
- sota-implementations/iql/utils.py +437 -0
- sota-implementations/multiagent/README.md +74 -0
- sota-implementations/multiagent/iql.py +237 -0
- sota-implementations/multiagent/maddpg_iddpg.py +266 -0
- sota-implementations/multiagent/mappo_ippo.py +267 -0
- sota-implementations/multiagent/qmix_vdn.py +271 -0
- sota-implementations/multiagent/sac.py +337 -0
- sota-implementations/multiagent/utils/__init__.py +4 -0
- sota-implementations/multiagent/utils/logging.py +151 -0
- sota-implementations/multiagent/utils/utils.py +43 -0
- sota-implementations/ppo/README.md +29 -0
- sota-implementations/ppo/ppo_atari.py +305 -0
- sota-implementations/ppo/ppo_mujoco.py +293 -0
- sota-implementations/ppo/utils_atari.py +238 -0
- sota-implementations/ppo/utils_mujoco.py +152 -0
- sota-implementations/ppo_trainer/train.py +21 -0
- sota-implementations/redq/README.md +7 -0
- sota-implementations/redq/redq.py +199 -0
- sota-implementations/redq/utils.py +1060 -0
- sota-implementations/sac/sac-async.py +266 -0
- sota-implementations/sac/sac.py +239 -0
- sota-implementations/sac/utils.py +381 -0
- sota-implementations/sac_trainer/train.py +16 -0
- sota-implementations/td3/td3.py +254 -0
- sota-implementations/td3/utils.py +319 -0
- sota-implementations/td3_bc/td3_bc.py +177 -0
- sota-implementations/td3_bc/utils.py +251 -0
- torchrl/__init__.py +144 -0
- torchrl/_extension.py +74 -0
- torchrl/_torchrl.cp314t-win_amd64.pyd +0 -0
- torchrl/_utils.py +1431 -0
- torchrl/collectors/__init__.py +48 -0
- torchrl/collectors/_base.py +1058 -0
- torchrl/collectors/_constants.py +88 -0
- torchrl/collectors/_multi_async.py +324 -0
- torchrl/collectors/_multi_base.py +1805 -0
- torchrl/collectors/_multi_sync.py +464 -0
- torchrl/collectors/_runner.py +581 -0
- torchrl/collectors/_single.py +2009 -0
- torchrl/collectors/_single_async.py +259 -0
- torchrl/collectors/collectors.py +62 -0
- torchrl/collectors/distributed/__init__.py +32 -0
- torchrl/collectors/distributed/default_configs.py +133 -0
- torchrl/collectors/distributed/generic.py +1306 -0
- torchrl/collectors/distributed/ray.py +1092 -0
- torchrl/collectors/distributed/rpc.py +1006 -0
- torchrl/collectors/distributed/sync.py +731 -0
- torchrl/collectors/distributed/utils.py +160 -0
- torchrl/collectors/llm/__init__.py +10 -0
- torchrl/collectors/llm/base.py +494 -0
- torchrl/collectors/llm/ray_collector.py +275 -0
- torchrl/collectors/llm/utils.py +36 -0
- torchrl/collectors/llm/weight_update/__init__.py +10 -0
- torchrl/collectors/llm/weight_update/vllm.py +348 -0
- torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
- torchrl/collectors/utils.py +433 -0
- torchrl/collectors/weight_update.py +591 -0
- torchrl/csrc/numpy_utils.h +38 -0
- torchrl/csrc/pybind.cpp +27 -0
- torchrl/csrc/segment_tree.h +458 -0
- torchrl/csrc/torch_utils.h +34 -0
- torchrl/csrc/utils.cpp +48 -0
- torchrl/csrc/utils.h +31 -0
- torchrl/data/__init__.py +187 -0
- torchrl/data/datasets/__init__.py +58 -0
- torchrl/data/datasets/atari_dqn.py +878 -0
- torchrl/data/datasets/common.py +281 -0
- torchrl/data/datasets/d4rl.py +489 -0
- torchrl/data/datasets/d4rl_infos.py +187 -0
- torchrl/data/datasets/gen_dgrl.py +375 -0
- torchrl/data/datasets/minari_data.py +643 -0
- torchrl/data/datasets/openml.py +177 -0
- torchrl/data/datasets/openx.py +798 -0
- torchrl/data/datasets/roboset.py +363 -0
- torchrl/data/datasets/utils.py +11 -0
- torchrl/data/datasets/vd4rl.py +432 -0
- torchrl/data/llm/__init__.py +34 -0
- torchrl/data/llm/dataset.py +491 -0
- torchrl/data/llm/history.py +1378 -0
- torchrl/data/llm/prompt.py +198 -0
- torchrl/data/llm/reward.py +225 -0
- torchrl/data/llm/topk.py +186 -0
- torchrl/data/llm/utils.py +543 -0
- torchrl/data/map/__init__.py +21 -0
- torchrl/data/map/hash.py +185 -0
- torchrl/data/map/query.py +204 -0
- torchrl/data/map/tdstorage.py +363 -0
- torchrl/data/map/tree.py +1434 -0
- torchrl/data/map/utils.py +103 -0
- torchrl/data/postprocs/__init__.py +8 -0
- torchrl/data/postprocs/postprocs.py +391 -0
- torchrl/data/replay_buffers/__init__.py +99 -0
- torchrl/data/replay_buffers/checkpointers.py +622 -0
- torchrl/data/replay_buffers/ray_buffer.py +292 -0
- torchrl/data/replay_buffers/replay_buffers.py +2376 -0
- torchrl/data/replay_buffers/samplers.py +2578 -0
- torchrl/data/replay_buffers/scheduler.py +265 -0
- torchrl/data/replay_buffers/storages.py +2412 -0
- torchrl/data/replay_buffers/utils.py +1042 -0
- torchrl/data/replay_buffers/writers.py +781 -0
- torchrl/data/tensor_specs.py +7101 -0
- torchrl/data/utils.py +334 -0
- torchrl/envs/__init__.py +265 -0
- torchrl/envs/async_envs.py +1105 -0
- torchrl/envs/batched_envs.py +3093 -0
- torchrl/envs/common.py +4241 -0
- torchrl/envs/custom/__init__.py +11 -0
- torchrl/envs/custom/chess.py +617 -0
- torchrl/envs/custom/llm.py +214 -0
- torchrl/envs/custom/pendulum.py +401 -0
- torchrl/envs/custom/san_moves.txt +29274 -0
- torchrl/envs/custom/tictactoeenv.py +288 -0
- torchrl/envs/env_creator.py +263 -0
- torchrl/envs/gym_like.py +752 -0
- torchrl/envs/libs/__init__.py +68 -0
- torchrl/envs/libs/_gym_utils.py +326 -0
- torchrl/envs/libs/brax.py +846 -0
- torchrl/envs/libs/dm_control.py +544 -0
- torchrl/envs/libs/envpool.py +447 -0
- torchrl/envs/libs/gym.py +2239 -0
- torchrl/envs/libs/habitat.py +138 -0
- torchrl/envs/libs/isaac_lab.py +87 -0
- torchrl/envs/libs/isaacgym.py +203 -0
- torchrl/envs/libs/jax_utils.py +166 -0
- torchrl/envs/libs/jumanji.py +963 -0
- torchrl/envs/libs/meltingpot.py +599 -0
- torchrl/envs/libs/openml.py +153 -0
- torchrl/envs/libs/openspiel.py +652 -0
- torchrl/envs/libs/pettingzoo.py +1042 -0
- torchrl/envs/libs/procgen.py +351 -0
- torchrl/envs/libs/robohive.py +429 -0
- torchrl/envs/libs/smacv2.py +645 -0
- torchrl/envs/libs/unity_mlagents.py +891 -0
- torchrl/envs/libs/utils.py +147 -0
- torchrl/envs/libs/vmas.py +813 -0
- torchrl/envs/llm/__init__.py +63 -0
- torchrl/envs/llm/chat.py +730 -0
- torchrl/envs/llm/datasets/README.md +4 -0
- torchrl/envs/llm/datasets/__init__.py +17 -0
- torchrl/envs/llm/datasets/gsm8k.py +353 -0
- torchrl/envs/llm/datasets/ifeval.py +274 -0
- torchrl/envs/llm/envs.py +789 -0
- torchrl/envs/llm/libs/README.md +3 -0
- torchrl/envs/llm/libs/__init__.py +8 -0
- torchrl/envs/llm/libs/mlgym.py +869 -0
- torchrl/envs/llm/reward/__init__.py +10 -0
- torchrl/envs/llm/reward/gsm8k.py +324 -0
- torchrl/envs/llm/reward/ifeval/README.md +13 -0
- torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
- torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
- torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
- torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
- torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
- torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
- torchrl/envs/llm/transforms/__init__.py +55 -0
- torchrl/envs/llm/transforms/browser.py +292 -0
- torchrl/envs/llm/transforms/dataloading.py +859 -0
- torchrl/envs/llm/transforms/format.py +73 -0
- torchrl/envs/llm/transforms/kl.py +1544 -0
- torchrl/envs/llm/transforms/policy_version.py +189 -0
- torchrl/envs/llm/transforms/reason.py +323 -0
- torchrl/envs/llm/transforms/tokenizer.py +321 -0
- torchrl/envs/llm/transforms/tools.py +1955 -0
- torchrl/envs/model_based/__init__.py +9 -0
- torchrl/envs/model_based/common.py +180 -0
- torchrl/envs/model_based/dreamer.py +112 -0
- torchrl/envs/transforms/__init__.py +147 -0
- torchrl/envs/transforms/functional.py +48 -0
- torchrl/envs/transforms/gym_transforms.py +203 -0
- torchrl/envs/transforms/module.py +341 -0
- torchrl/envs/transforms/r3m.py +372 -0
- torchrl/envs/transforms/ray_service.py +663 -0
- torchrl/envs/transforms/rb_transforms.py +214 -0
- torchrl/envs/transforms/transforms.py +11835 -0
- torchrl/envs/transforms/utils.py +94 -0
- torchrl/envs/transforms/vc1.py +307 -0
- torchrl/envs/transforms/vecnorm.py +845 -0
- torchrl/envs/transforms/vip.py +407 -0
- torchrl/envs/utils.py +1718 -0
- torchrl/envs/vec_envs.py +11 -0
- torchrl/modules/__init__.py +206 -0
- torchrl/modules/distributions/__init__.py +73 -0
- torchrl/modules/distributions/continuous.py +830 -0
- torchrl/modules/distributions/discrete.py +908 -0
- torchrl/modules/distributions/truncated_normal.py +187 -0
- torchrl/modules/distributions/utils.py +233 -0
- torchrl/modules/llm/__init__.py +62 -0
- torchrl/modules/llm/backends/__init__.py +65 -0
- torchrl/modules/llm/backends/vllm/__init__.py +94 -0
- torchrl/modules/llm/backends/vllm/_models.py +46 -0
- torchrl/modules/llm/backends/vllm/base.py +72 -0
- torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
- torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
- torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
- torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
- torchrl/modules/llm/policies/__init__.py +28 -0
- torchrl/modules/llm/policies/common.py +1809 -0
- torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
- torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
- torchrl/modules/llm/utils.py +23 -0
- torchrl/modules/mcts/__init__.py +21 -0
- torchrl/modules/mcts/scores.py +579 -0
- torchrl/modules/models/__init__.py +86 -0
- torchrl/modules/models/batchrenorm.py +119 -0
- torchrl/modules/models/decision_transformer.py +179 -0
- torchrl/modules/models/exploration.py +731 -0
- torchrl/modules/models/llm.py +156 -0
- torchrl/modules/models/model_based.py +596 -0
- torchrl/modules/models/models.py +1712 -0
- torchrl/modules/models/multiagent.py +1067 -0
- torchrl/modules/models/recipes/impala.py +185 -0
- torchrl/modules/models/utils.py +162 -0
- torchrl/modules/planners/__init__.py +10 -0
- torchrl/modules/planners/cem.py +228 -0
- torchrl/modules/planners/common.py +73 -0
- torchrl/modules/planners/mppi.py +265 -0
- torchrl/modules/tensordict_module/__init__.py +89 -0
- torchrl/modules/tensordict_module/actors.py +2457 -0
- torchrl/modules/tensordict_module/common.py +529 -0
- torchrl/modules/tensordict_module/exploration.py +814 -0
- torchrl/modules/tensordict_module/probabilistic.py +321 -0
- torchrl/modules/tensordict_module/rnn.py +1639 -0
- torchrl/modules/tensordict_module/sequence.py +132 -0
- torchrl/modules/tensordict_module/world_models.py +34 -0
- torchrl/modules/utils/__init__.py +38 -0
- torchrl/modules/utils/mappings.py +9 -0
- torchrl/modules/utils/utils.py +89 -0
- torchrl/objectives/__init__.py +78 -0
- torchrl/objectives/a2c.py +659 -0
- torchrl/objectives/common.py +753 -0
- torchrl/objectives/cql.py +1346 -0
- torchrl/objectives/crossq.py +710 -0
- torchrl/objectives/ddpg.py +453 -0
- torchrl/objectives/decision_transformer.py +371 -0
- torchrl/objectives/deprecated.py +516 -0
- torchrl/objectives/dqn.py +683 -0
- torchrl/objectives/dreamer.py +488 -0
- torchrl/objectives/functional.py +48 -0
- torchrl/objectives/gail.py +258 -0
- torchrl/objectives/iql.py +996 -0
- torchrl/objectives/llm/__init__.py +30 -0
- torchrl/objectives/llm/grpo.py +846 -0
- torchrl/objectives/llm/sft.py +482 -0
- torchrl/objectives/multiagent/__init__.py +8 -0
- torchrl/objectives/multiagent/qmixer.py +396 -0
- torchrl/objectives/ppo.py +1669 -0
- torchrl/objectives/redq.py +683 -0
- torchrl/objectives/reinforce.py +530 -0
- torchrl/objectives/sac.py +1580 -0
- torchrl/objectives/td3.py +570 -0
- torchrl/objectives/td3_bc.py +625 -0
- torchrl/objectives/utils.py +782 -0
- torchrl/objectives/value/__init__.py +28 -0
- torchrl/objectives/value/advantages.py +1956 -0
- torchrl/objectives/value/functional.py +1459 -0
- torchrl/objectives/value/utils.py +360 -0
- torchrl/record/__init__.py +17 -0
- torchrl/record/loggers/__init__.py +23 -0
- torchrl/record/loggers/common.py +48 -0
- torchrl/record/loggers/csv.py +226 -0
- torchrl/record/loggers/mlflow.py +142 -0
- torchrl/record/loggers/tensorboard.py +139 -0
- torchrl/record/loggers/trackio.py +163 -0
- torchrl/record/loggers/utils.py +78 -0
- torchrl/record/loggers/wandb.py +214 -0
- torchrl/record/recorder.py +554 -0
- torchrl/services/__init__.py +79 -0
- torchrl/services/base.py +109 -0
- torchrl/services/ray_service.py +453 -0
- torchrl/testing/__init__.py +107 -0
- torchrl/testing/assertions.py +179 -0
- torchrl/testing/dist_utils.py +122 -0
- torchrl/testing/env_creators.py +227 -0
- torchrl/testing/env_helper.py +35 -0
- torchrl/testing/gym_helpers.py +156 -0
- torchrl/testing/llm_mocks.py +119 -0
- torchrl/testing/mocking_classes.py +2720 -0
- torchrl/testing/modules.py +295 -0
- torchrl/testing/mp_helpers.py +15 -0
- torchrl/testing/ray_helpers.py +293 -0
- torchrl/testing/utils.py +190 -0
- torchrl/trainers/__init__.py +42 -0
- torchrl/trainers/algorithms/__init__.py +11 -0
- torchrl/trainers/algorithms/configs/__init__.py +705 -0
- torchrl/trainers/algorithms/configs/collectors.py +216 -0
- torchrl/trainers/algorithms/configs/common.py +41 -0
- torchrl/trainers/algorithms/configs/data.py +308 -0
- torchrl/trainers/algorithms/configs/envs.py +104 -0
- torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
- torchrl/trainers/algorithms/configs/logging.py +80 -0
- torchrl/trainers/algorithms/configs/modules.py +570 -0
- torchrl/trainers/algorithms/configs/objectives.py +177 -0
- torchrl/trainers/algorithms/configs/trainers.py +340 -0
- torchrl/trainers/algorithms/configs/transforms.py +955 -0
- torchrl/trainers/algorithms/configs/utils.py +252 -0
- torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
- torchrl/trainers/algorithms/configs/weight_update.py +159 -0
- torchrl/trainers/algorithms/ppo.py +373 -0
- torchrl/trainers/algorithms/sac.py +308 -0
- torchrl/trainers/helpers/__init__.py +40 -0
- torchrl/trainers/helpers/collectors.py +416 -0
- torchrl/trainers/helpers/envs.py +573 -0
- torchrl/trainers/helpers/logger.py +33 -0
- torchrl/trainers/helpers/losses.py +132 -0
- torchrl/trainers/helpers/models.py +658 -0
- torchrl/trainers/helpers/replay_buffer.py +59 -0
- torchrl/trainers/helpers/trainers.py +301 -0
- torchrl/trainers/trainers.py +2052 -0
- torchrl/weight_update/__init__.py +33 -0
- torchrl/weight_update/_distributed.py +749 -0
- torchrl/weight_update/_mp.py +624 -0
- torchrl/weight_update/_noupdate.py +102 -0
- torchrl/weight_update/_ray.py +1032 -0
- torchrl/weight_update/_rpc.py +284 -0
- torchrl/weight_update/_shared.py +891 -0
- torchrl/weight_update/llm/__init__.py +32 -0
- torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
- torchrl/weight_update/llm/vllm_nccl.py +710 -0
- torchrl/weight_update/utils.py +73 -0
- torchrl/weight_update/weight_sync_schemes.py +1244 -0
- torchrl-0.11.0.dist-info/LICENSE +21 -0
- torchrl-0.11.0.dist-info/METADATA +1307 -0
- torchrl-0.11.0.dist-info/RECORD +394 -0
- torchrl-0.11.0.dist-info/WHEEL +5 -0
- torchrl-0.11.0.dist-info/entry_points.txt +2 -0
- torchrl-0.11.0.dist-info/top_level.txt +7 -0
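
For reference, a listing like the one above can be reproduced from the wheel itself. The snippet below is a minimal sketch, assuming the wheel has been downloaded locally (the filename path and the byte-size output are illustrative, not part of this diff); since a `.whl` file is a standard zip archive, Python's stdlib `zipfile` module is sufficient:

```python
# Sketch: enumerate the files bundled in a locally downloaded torchrl wheel.
# The wheel path below is an assumption; point it at wherever the file was saved.
from zipfile import ZipFile

wheel_path = "torchrl-0.11.0-cp314-cp314t-win_amd64.whl"

with ZipFile(wheel_path) as whl:
    for info in whl.infolist():
        # Print each archived path with its uncompressed size in bytes
        # (the table above reports added/removed line counts instead).
        print(f"{info.filename}  ({info.file_size} bytes)")
```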
@@ -0,0 +1,512 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+import gc
+import time
+from functools import partial
+from pathlib import Path
+
+import hydra
+
+from torchrl import merge_ray_runtime_env, torchrl_logger
+from torchrl.data.llm.history import History
+from torchrl.record.loggers.wandb import WandbLogger
+from torchrl.weight_update.llm import get_model_metadata
+
+try:
+    import ray
+except ImportError:
+    raise ImportError(
+        "Ray is required for async training. Please install ray with `pip install ray`."
+    )
+import torch
+import tqdm
+
+from ei_utils import (
+    compute_device_allocation,
+    create_cosine_scheduler_with_warmup,
+    get_inference_model,
+    get_train_model,
+    log_training_metrics,
+    make_env,
+    make_weight_sync_scheme,
+    RemoteDataLogger,
+)
+from omegaconf import DictConfig
+from ray.util.queue import Queue
+
+try:
+    from tensordict import set_list_to_stack
+except ImportError:
+    raise ImportError(
+        "TensorDict is required. Please install it with `pip install tensordict`."
+    )
+from torch.amp.autocast_mode import autocast
+from torch.amp.grad_scaler import GradScaler
+from torchrl._utils import timeit
+from torchrl.collectors.llm import RayLLMCollector
+from torchrl.data import (
+    LazyStackStorage,
+    PrioritizedSampler,
+    ReplayBuffer,
+    TensorDictReplayBuffer,
+)
+from torchrl.data.llm.topk import TopKRewardSelector
+from torchrl.data.replay_buffers.ray_buffer import RayReplayBuffer
+from torchrl.objectives.llm.sft import SFTLoss
+
+
+def setup_environment() -> None:
+    """Setup required environment variables and configurations."""
+
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required for training")
+
+    # Set default dtype to float32 for mixed precision training
+    torch.set_default_dtype(torch.float32)
+    torch.set_default_device("cuda:0")
+    set_list_to_stack(True).set()
+
+    # Ensure CUDA is using the correct dtype
+    if torch.cuda.is_available():
+        torch.cuda.set_device("cuda:0")
+
+
+def train(
+    replay_buffer: ReplayBuffer,
+    cfg: DictConfig,
+    collector: RayLLMCollector,
+    devices: list[int] | None = None,
+):
+    """Main training loop for EI async.
+
+    This function implements asynchronous training where data collection and optimization
+    happen concurrently. The total number of steps is determined by the number of epochs,
+    samples per epoch, and batches collected.
+
+    Args:
+        replay_buffer: The replay buffer to store experiences
+        cfg: The configuration object containing training parameters
+        collector: The collector object.
+        devices: The devices to use for the training model.
+    """
+    # Setup training model and tokenizer
+    policy_training, train_tokenizer = get_train_model(
+        cfg, devices=devices, chat_template_name="qwen"
+    )
+    train_device = devices[0]  # Use first device for batch processing
+
+    # Setup loss function
+    loss_fn = SFTLoss(
+        actor_network=policy_training,
+        kl_to_ref_coeff=cfg.train.kl_to_ref_coeff,
+        tokenizer=train_tokenizer,
+        tokenizer_kwargs={"chat_template_name": "qwen"},
+        device=torch.device(f"cuda:{train_device}")
+        if train_device is not None
+        else None,
+        loss_function=cfg.train.loss_function,
+        beta=cfg.train.minor_sft_beta,
+    )
+    if cfg.model.compile:
+        loss_fn = torch.compile(loss_fn)
+
+    # Get vLLM engine from the inference policy
+    # Note: In expert iteration, the inference policy is typically created in get_inference_model
+    # We need to get the vLLM engine from the collector's policy or create it
+    # For now, we'll use the approach similar to GRPO with explicit scheme creation
+
+    # Create weight sync scheme
+    weight_sync_scheme = make_weight_sync_scheme(
+        master_address="localhost",  # Since we're running locally
+        master_port=None,  # Will auto-assign an open port
+        vllm_tp_size=cfg.inference_model.num_devices
+        if cfg.inference_model.num_devices is not None
+        else len(cfg.inference_model.get("devices", [1])),
+    )
+
+    # Set up weight sender
+    torchrl_logger.info("Setting up weight synchronization scheme...")
+    sender = weight_sync_scheme.create_sender()
+    sender.register_model(policy_training)
+
+    # Get vLLM engine reference from collector's policy
+    # The collector has the policy which wraps the vLLM engine
+    vllm_engine = collector.policy.model if hasattr(collector, "policy") else None
+    if vllm_engine is None:
+        raise RuntimeError("Could not get vLLM engine from collector policy")
+
+    # Initialize collective group
+    torchrl_logger.info("Initializing collective group...")
+    metadata = get_model_metadata(policy_training)
+    sender.init_all_workers_group(metadata, vllm_engine=vllm_engine)
+
+    # First weight update
+    with timeit("update_policy_weights"):
+        sender.update_weights()
+    timeit.print(prefix="First update_policy_weights_ time")
+    timeit.reset()
+
+    # Make optimizer
+    torchrl_logger.info("Starting optimizer.")
+    optimizer = torch.optim.Adam(
+        policy_training.parameters(),
+        lr=cfg.optimizer.lr,
+        weight_decay=cfg.optimizer.weight_decay,
+        fused=False,
+    )
+    scaler = GradScaler(enabled=cfg.train.mixed_precision)
+
+    # Calculate total optimization steps for scheduler
+    # The training loop structure: for each collector iteration, we do cfg.train.epochs epochs
+    # Each epoch processes the entire replay buffer, and optimization happens every gradient_accumulation_steps
+    # We need to estimate the total number of optimization steps
+    # For now, we'll use a conservative estimate based on the total dialog turns
+    # This can be refined based on the actual training dynamics
+    total_optim_steps = (
+        cfg.train.total_dialog_turns
+        * cfg.train.epochs
+        // cfg.train.gradient_accumulation_steps
+    )
+
+    # Create scheduler if enabled
+    scheduler = None
+    if cfg.optimizer.scheduler.enabled:
+        warmup_steps = cfg.optimizer.scheduler.warmup_steps
+        num_cycles = cfg.optimizer.scheduler.num_cycles
+        torchrl_logger.info(
+            f"Creating {cfg.optimizer.scheduler.type} scheduler with {warmup_steps} warmup steps out of {total_optim_steps} total steps"
+        )
+
+        scheduler = create_cosine_scheduler_with_warmup(
+            optimizer,
+            num_warmup_steps=warmup_steps,
+            num_training_steps=total_optim_steps,
+            num_cycles=num_cycles,
+        )
+
+    # Make checkpoint dir
+    checkpoint_dir = Path(cfg.logging.checkpoint_dir)
+    checkpoint_dir.mkdir(parents=True, exist_ok=True)
+
+    # Make wandb logger
+    torchrl_logger.info("Starting wandb logger.")
+    experiment_name = cfg.logging.experiment_name
+    if experiment_name is not None:
+        experiment_name = [experiment_name]
+    else:
+        experiment_name = []
+
+    experiment_name.append(cfg.env.dataset)
+    experiment_name.append(cfg.model.name)
+
+    # Create local wandb logger for training metrics
+    wandb_config = {
+        "project": "ei-async",
+        "exp_name": "-".join(["ei-async"] + experiment_name),
+    }
+    wandb_logger = WandbLogger(**wandb_config)
+
+    # Pass the logging actor reference to the collector
+    log_queue = Queue(maxsize=1000)
+    collector.set_postproc(RemoteDataLogger(log_queue=log_queue))
+
+    # Start collector
+    collector.start()
+
+    # Wait for initial data
+    while not replay_buffer.write_count:
+        time.sleep(1)
+
+    # Training loop
+    total_steps = (
+        -(cfg.train.total_dialog_turns // -cfg.train.optim_batch_size)
+        * cfg.train.epochs
+    )
+    torchrl_logger.info(f"Total steps: {total_steps}")
+
+    pbar = tqdm.tqdm(total=total_steps)
+    grad_norm = 0.0  # Initialize grad_norm
+    data_read_count = 0
+    optim_step = 0
+    start_time = time.time()
+
+    for step in range(total_steps):
+        pbar.update(1)
+        pbar.set_description(f"Step {step}, writes: {replay_buffer.write_count}")
+
+        with timeit("sampling"):
+            # Sample batch and move to device
+            batch = replay_buffer.sample(cfg.train.optim_batch_size).to(train_device)
+
+        max_policy_age = (
+            batch.view(-1)[0]["next", "policy_version"] - collector.policy_version
+        ).max()
+        if (
+            cfg.train.max_policy_age is not None
+            and max_policy_age > cfg.train.max_policy_age
+        ):
+            # Skip this batch, as it's too old
+            torchrl_logger.info(f"Skipping batch with policy age {max_policy_age}")
+            continue
+
+        # For logging purposes, we get the last element of the history
+        # and convert it to a string
+        history: History = batch.view(-1)[0]["next", "history", "prompt"]
+        history_str: list[str] | str = history.apply_chat_template(
+            tokenizer=train_tokenizer
+        )
+        while not isinstance(history_str, str):
+            history_str = "\n".join(history_str)
+
+        data_read_count += batch.numel()
+
+        with timeit("forward_pass"):
+            # Forward pass with mixed precision
+            with autocast("cuda", enabled=cfg.train.mixed_precision):
+                loss = loss_fn(batch)
+                if loss.loss_kl_to_ref is not None:
+                    loss_val = loss.loss_sft + loss.loss_kl_to_ref
+                else:
+                    loss_val = loss.loss_sft
+                loss_val = loss_val / cfg.train.gradient_accumulation_steps
+        with timeit("backward_pass"):
+            # Backward pass
+            if cfg.train.mixed_precision and cfg.train_model.torch_dtype == "float16":
+                scaler = GradScaler(enabled=True)
+                scaler.scale(loss_val).backward()
+            else:
+                loss_val.backward()
+
+        # Optimization step
+        if ((step + 1) % cfg.train.gradient_accumulation_steps) == 0:
+            with timeit("optim_step"):
+                if (
+                    cfg.train.mixed_precision
+                    and cfg.train_model.torch_dtype == "float16"
+                ):
+                    scaler.unscale_(optimizer)
+
+                grad_norm = torch.nn.utils.clip_grad_norm_(
+                    policy_training.parameters(),
+                    cfg.optimizer.clip_grad_norm,
+                )
+
+                if (
+                    cfg.train.mixed_precision
+                    and cfg.train_model.torch_dtype == "float16"
+                ):
+                    scaler.step(optimizer)
+                    scaler.update()
+                else:
+                    optimizer.step()
+                optimizer.zero_grad(set_to_none=True)
+
+                # Step the scheduler
+                if scheduler is not None:
+                    scheduler.step()
+
+                # Increment optimization step counter
+                optim_step += 1
+
+        # Update metrics
+        if (step % cfg.train.logging_frequency) == 0:
+            log_training_metrics(
+                wandb_logger=wandb_logger,
+                replay_buffer=replay_buffer,
+                batch=batch,
+                loss=loss,
+                grad_norm=grad_norm,
+                global_step=step,
+                data_read_count=data_read_count,
+                collector=collector,
+                start_time=start_time,
+                gradient_accumulation_steps=cfg.train.gradient_accumulation_steps,
+                history_str=history_str,
+            )
+            # Log additional metrics
+            wandb_logger.log_scalar(
+                "learning_rate", float(optimizer.param_groups[0]["lr"]), step=step
+            )
+            wandb_logger.log_scalar("optim_step", optim_step, step=step)
+            while not log_queue.empty():
+                logs = log_queue.get()
+                for k, v in logs.items():
+                    wandb_logger.log_scalar(k, v)
+
+        # Update policy weights
+        if step % cfg.train.weight_update_frequency == 0:
+            with timeit("update_policy_weights"):
+                torchrl_logger.info("Updating policy weights...")
+                sender.update_weights()
+            # TODO: do we need this? Does it interfere with other processes?
+            # torch.cuda.empty_cache()
+            gc.collect()
+
+        # Checkpointing disabled to prevent disk space issues
+        # if (step + 1) % cfg.train.checkpoint_frequency == 0:
+        #     with timeit("save_checkpoint"):
+        #         torchrl_logger.info(
+        #             f"Saving checkpoint {(step+1) // cfg.train.checkpoint_frequency}..."
+        #         )
+        #         checkpoint = {
+        #             "step": step,
+        #             "model_state_dict": policy_training.model.state_dict(),
+        #             "optimizer_state_dict": optimizer.state_dict(),
+        #             "scaler_state_dict": scaler.state_dict(),
+        #             "config": dict(cfg),
+        #         }
+        #         torch.save(checkpoint, checkpoint_dir / f"checkpoint_{step:04d}.pt")
+
+        if step % cfg.train.weight_update_frequency == 0:
+            timeit.print(prefix="timeit")
+            for key, val in timeit.todict().items():
+                wandb_logger.log_scalar(f"timeit/{key}", val)
+            timeit.reset()
+
+        # Clear memory
+        del loss_val
+        # TODO: do we need this? Does it interfere with other processes?
+        # torch.cuda.empty_cache()
+        gc.collect()
+
+    pbar.close()
+    collector.shutdown()
+
+
+@hydra.main(version_base=None, config_path="config", config_name="ei_gsm8k")
+def main(cfg):
+    # Force async mode
+    if cfg.train.sync:
+        raise ValueError(
+            "expert-iteration-async.py must run in async mode (`python expert-iteration-async.py mode=async`). Please use expert-iteration-sync.py for sync mode (`python expert-iteration-sync.py mode=sync`)."
+        )
+
+    # Compute device allocation
+    device_config = compute_device_allocation(cfg)
+
+    if not ray.is_initialized():
+        # Convert OmegaConf to regular dict and filter out unsupported parameters
+        ray_init_config = {
+            k: dict(v) if isinstance(v, DictConfig) else v
+            for k, v in dict(cfg.ray.init_config).items()
+            if not k.startswith("_")
+        }
+
+        # Add computed GPU configuration and merge with default runtime_env
+        ray_init_config["num_gpus"] = device_config["ray_num_gpus"]
+        ray_init_config = merge_ray_runtime_env(ray_init_config)
+        torchrl_logger.info(f"Ray init config: {ray_init_config=}")
+        ray.init(**ray_init_config)
+
+    # Check if num_devices is set
+    if cfg.inference_model.num_devices is None:
+        raise ValueError(
+            "Inference model num_devices must be set via inference_model.num_devices"
+        )
+    if cfg.ref_model.num_devices is None:
+        raise ValueError("Ref model num_devices must be set via ref_model.num_devices")
+    if cfg.train_model.num_devices is None:
+        raise ValueError(
+            "Train model num_devices must be set via train_model.num_devices"
+        )
+
+    # Convert OmegaConf to regular dict for Ray configs
+    replay_buffer_config = dict(cfg.ray.replay_buffer_config)
+    collector_config = dict(cfg.ray.collector_config)
+    train_handler_config = dict(cfg.ray.train_handler_config)
+
+    inference_policy = get_inference_model(
+        cfg, devices=device_config["inference_model_devices"]
+    )
+    torchrl_logger.info(f"Inference policy: {inference_policy}")
+
+    torchrl_logger.info(f"Starting replay buffer with {replay_buffer_config=}")
+    rb_size = cfg.train.buffer_size
+    if rb_size is None:
+        # Hardcoded for now
+        rb_size = 256
+    if cfg.train.prioritized_sampling:
+        rb_cls = TensorDictReplayBuffer
+        rb_sampler_cls = partial(
+            PrioritizedSampler,
+            max_capacity=rb_size,
+            alpha=cfg.train.prioritized_sampling_alpha,
+            beta=cfg.train.prioritized_sampling_beta,
+            eps=cfg.train.prioritized_sampling_epsilon,
+        )
+        kwargs = {"priority_key": ("next", "reward")}
+    else:
+        rb_cls = ReplayBuffer
+        rb_sampler_cls = None
+        kwargs = {}
+    rb = RayReplayBuffer(
+        storage=partial(
+            LazyStackStorage,
+            rb_size,
+            device="cpu",
+        ),
+        transform_factory=partial(
+            TopKRewardSelector,
+            total_dialog_turns=cfg.env.repeats,
+            topk_size=cfg.train.topk_size,
+        ),
+        batch_size=cfg.train.optim_batch_size,
+        remote_config=replay_buffer_config,
+        replay_buffer_cls=rb_cls,
+        sampler=rb_sampler_cls,
+        **kwargs,
+    )
+    torchrl_logger.info(f"Replay buffer: {rb}")
+
+    # Create remote collector using RayLLMCollector
+    collector_config["num_gpus"] = (
+        # The ref model will be instantiated within the collector, so we only need to allocate the number of devices for the inference model
+        cfg.ref_model.num_devices
+    )
+    torchrl_logger.info(f"Starting collector with {collector_config=}")
+
+    dialog_turns_per_batch = cfg.train.dialog_turns_per_batch
+    if dialog_turns_per_batch is None:
+        # Hardcoded for now
+        dialog_turns_per_batch = cfg.env.repeats
+
+    collector = RayLLMCollector(
+        env=partial(make_env, cfg, devices=device_config["ref_model_devices"]),
+        policy=inference_policy,
+        dialog_turns_per_batch=dialog_turns_per_batch,
+        total_dialog_turns=cfg.train.total_dialog_turns,
+        replay_buffer=rb,
+        ray_init_config=None,  # Ray is already initialized
+        weight_updater=None,  # We'll create this after getting the remote LLM
+        track_policy_version=True,
+        remote_config=collector_config,
+        verbose=True,
+    )
+    # Ensure collector is initialized by calling a method that will block until ready
+    ray.get(collector._collector.is_initialized.remote())
+    torchrl_logger.info(f"Collector: {collector}")
+
+    train_handler_config = {
+        "num_cpus": train_handler_config.get("num_cpus", 1),
+        "num_gpus": cfg.train_model.num_devices,
+    }
+    torchrl_logger.info(f"Starting training handler with {train_handler_config=}")
+    train_handler = ray.remote(
+        **train_handler_config,
+    )(train)
+
+    # launch training
+    ray.get(
+        train_handler.remote(rb, cfg, collector, device_config["train_model_devices"])
+    )
+
+
+if __name__ == "__main__":
+    # Setup environment
+    setup_environment()
+    main()