torchrl 0.11.0__cp314-cp314-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (394)
  1. benchmarks/benchmark_batched_envs.py +104 -0
  2. benchmarks/conftest.py +91 -0
  3. benchmarks/ecosystem/gym_env_throughput.py +321 -0
  4. benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
  5. benchmarks/requirements.txt +7 -0
  6. benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
  7. benchmarks/test_collectors_benchmark.py +240 -0
  8. benchmarks/test_compressed_storage_benchmark.py +145 -0
  9. benchmarks/test_envs_benchmark.py +133 -0
  10. benchmarks/test_llm.py +101 -0
  11. benchmarks/test_non_tensor_env_benchmark.py +70 -0
  12. benchmarks/test_objectives_benchmarks.py +1199 -0
  13. benchmarks/test_replaybuffer_benchmark.py +254 -0
  14. sota-check/README.md +35 -0
  15. sota-implementations/README.md +142 -0
  16. sota-implementations/a2c/README.md +39 -0
  17. sota-implementations/a2c/a2c_atari.py +291 -0
  18. sota-implementations/a2c/a2c_mujoco.py +273 -0
  19. sota-implementations/a2c/utils_atari.py +240 -0
  20. sota-implementations/a2c/utils_mujoco.py +160 -0
  21. sota-implementations/bandits/README.md +7 -0
  22. sota-implementations/bandits/dqn.py +126 -0
  23. sota-implementations/cql/cql_offline.py +198 -0
  24. sota-implementations/cql/cql_online.py +249 -0
  25. sota-implementations/cql/discrete_cql_offline.py +180 -0
  26. sota-implementations/cql/discrete_cql_online.py +227 -0
  27. sota-implementations/cql/utils.py +471 -0
  28. sota-implementations/crossq/crossq.py +271 -0
  29. sota-implementations/crossq/utils.py +320 -0
  30. sota-implementations/ddpg/ddpg.py +231 -0
  31. sota-implementations/ddpg/utils.py +325 -0
  32. sota-implementations/decision_transformer/dt.py +163 -0
  33. sota-implementations/decision_transformer/lamb.py +167 -0
  34. sota-implementations/decision_transformer/online_dt.py +178 -0
  35. sota-implementations/decision_transformer/utils.py +562 -0
  36. sota-implementations/discrete_sac/discrete_sac.py +243 -0
  37. sota-implementations/discrete_sac/utils.py +324 -0
  38. sota-implementations/dqn/README.md +30 -0
  39. sota-implementations/dqn/dqn_atari.py +272 -0
  40. sota-implementations/dqn/dqn_cartpole.py +236 -0
  41. sota-implementations/dqn/utils_atari.py +132 -0
  42. sota-implementations/dqn/utils_cartpole.py +90 -0
  43. sota-implementations/dreamer/README.md +129 -0
  44. sota-implementations/dreamer/dreamer.py +586 -0
  45. sota-implementations/dreamer/dreamer_utils.py +1107 -0
  46. sota-implementations/expert-iteration/README.md +352 -0
  47. sota-implementations/expert-iteration/ei_utils.py +770 -0
  48. sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
  49. sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
  50. sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
  51. sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
  52. sota-implementations/gail/gail.py +327 -0
  53. sota-implementations/gail/gail_utils.py +68 -0
  54. sota-implementations/gail/ppo_utils.py +157 -0
  55. sota-implementations/grpo/README.md +273 -0
  56. sota-implementations/grpo/grpo-async.py +437 -0
  57. sota-implementations/grpo/grpo-sync.py +435 -0
  58. sota-implementations/grpo/grpo_utils.py +843 -0
  59. sota-implementations/grpo/requirements_gsm8k.txt +11 -0
  60. sota-implementations/grpo/requirements_ifeval.txt +16 -0
  61. sota-implementations/impala/README.md +33 -0
  62. sota-implementations/impala/impala_multi_node_ray.py +292 -0
  63. sota-implementations/impala/impala_multi_node_submitit.py +284 -0
  64. sota-implementations/impala/impala_single_node.py +261 -0
  65. sota-implementations/impala/utils.py +184 -0
  66. sota-implementations/iql/discrete_iql.py +230 -0
  67. sota-implementations/iql/iql_offline.py +164 -0
  68. sota-implementations/iql/iql_online.py +225 -0
  69. sota-implementations/iql/utils.py +437 -0
  70. sota-implementations/multiagent/README.md +74 -0
  71. sota-implementations/multiagent/iql.py +237 -0
  72. sota-implementations/multiagent/maddpg_iddpg.py +266 -0
  73. sota-implementations/multiagent/mappo_ippo.py +267 -0
  74. sota-implementations/multiagent/qmix_vdn.py +271 -0
  75. sota-implementations/multiagent/sac.py +337 -0
  76. sota-implementations/multiagent/utils/__init__.py +4 -0
  77. sota-implementations/multiagent/utils/logging.py +151 -0
  78. sota-implementations/multiagent/utils/utils.py +43 -0
  79. sota-implementations/ppo/README.md +29 -0
  80. sota-implementations/ppo/ppo_atari.py +305 -0
  81. sota-implementations/ppo/ppo_mujoco.py +293 -0
  82. sota-implementations/ppo/utils_atari.py +238 -0
  83. sota-implementations/ppo/utils_mujoco.py +152 -0
  84. sota-implementations/ppo_trainer/train.py +21 -0
  85. sota-implementations/redq/README.md +7 -0
  86. sota-implementations/redq/redq.py +199 -0
  87. sota-implementations/redq/utils.py +1060 -0
  88. sota-implementations/sac/sac-async.py +266 -0
  89. sota-implementations/sac/sac.py +239 -0
  90. sota-implementations/sac/utils.py +381 -0
  91. sota-implementations/sac_trainer/train.py +16 -0
  92. sota-implementations/td3/td3.py +254 -0
  93. sota-implementations/td3/utils.py +319 -0
  94. sota-implementations/td3_bc/td3_bc.py +177 -0
  95. sota-implementations/td3_bc/utils.py +251 -0
  96. torchrl/__init__.py +144 -0
  97. torchrl/_extension.py +74 -0
  98. torchrl/_torchrl.cpython-314-aarch64-linux-gnu.so +0 -0
  99. torchrl/_utils.py +1431 -0
  100. torchrl/collectors/__init__.py +48 -0
  101. torchrl/collectors/_base.py +1058 -0
  102. torchrl/collectors/_constants.py +88 -0
  103. torchrl/collectors/_multi_async.py +324 -0
  104. torchrl/collectors/_multi_base.py +1805 -0
  105. torchrl/collectors/_multi_sync.py +464 -0
  106. torchrl/collectors/_runner.py +581 -0
  107. torchrl/collectors/_single.py +2009 -0
  108. torchrl/collectors/_single_async.py +259 -0
  109. torchrl/collectors/collectors.py +62 -0
  110. torchrl/collectors/distributed/__init__.py +32 -0
  111. torchrl/collectors/distributed/default_configs.py +133 -0
  112. torchrl/collectors/distributed/generic.py +1306 -0
  113. torchrl/collectors/distributed/ray.py +1092 -0
  114. torchrl/collectors/distributed/rpc.py +1006 -0
  115. torchrl/collectors/distributed/sync.py +731 -0
  116. torchrl/collectors/distributed/utils.py +160 -0
  117. torchrl/collectors/llm/__init__.py +10 -0
  118. torchrl/collectors/llm/base.py +494 -0
  119. torchrl/collectors/llm/ray_collector.py +275 -0
  120. torchrl/collectors/llm/utils.py +36 -0
  121. torchrl/collectors/llm/weight_update/__init__.py +10 -0
  122. torchrl/collectors/llm/weight_update/vllm.py +348 -0
  123. torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
  124. torchrl/collectors/utils.py +433 -0
  125. torchrl/collectors/weight_update.py +591 -0
  126. torchrl/csrc/numpy_utils.h +38 -0
  127. torchrl/csrc/pybind.cpp +27 -0
  128. torchrl/csrc/segment_tree.h +458 -0
  129. torchrl/csrc/torch_utils.h +34 -0
  130. torchrl/csrc/utils.cpp +48 -0
  131. torchrl/csrc/utils.h +31 -0
  132. torchrl/data/__init__.py +187 -0
  133. torchrl/data/datasets/__init__.py +58 -0
  134. torchrl/data/datasets/atari_dqn.py +878 -0
  135. torchrl/data/datasets/common.py +281 -0
  136. torchrl/data/datasets/d4rl.py +489 -0
  137. torchrl/data/datasets/d4rl_infos.py +187 -0
  138. torchrl/data/datasets/gen_dgrl.py +375 -0
  139. torchrl/data/datasets/minari_data.py +643 -0
  140. torchrl/data/datasets/openml.py +177 -0
  141. torchrl/data/datasets/openx.py +798 -0
  142. torchrl/data/datasets/roboset.py +363 -0
  143. torchrl/data/datasets/utils.py +11 -0
  144. torchrl/data/datasets/vd4rl.py +432 -0
  145. torchrl/data/llm/__init__.py +34 -0
  146. torchrl/data/llm/dataset.py +491 -0
  147. torchrl/data/llm/history.py +1378 -0
  148. torchrl/data/llm/prompt.py +198 -0
  149. torchrl/data/llm/reward.py +225 -0
  150. torchrl/data/llm/topk.py +186 -0
  151. torchrl/data/llm/utils.py +543 -0
  152. torchrl/data/map/__init__.py +21 -0
  153. torchrl/data/map/hash.py +185 -0
  154. torchrl/data/map/query.py +204 -0
  155. torchrl/data/map/tdstorage.py +363 -0
  156. torchrl/data/map/tree.py +1434 -0
  157. torchrl/data/map/utils.py +103 -0
  158. torchrl/data/postprocs/__init__.py +8 -0
  159. torchrl/data/postprocs/postprocs.py +391 -0
  160. torchrl/data/replay_buffers/__init__.py +99 -0
  161. torchrl/data/replay_buffers/checkpointers.py +622 -0
  162. torchrl/data/replay_buffers/ray_buffer.py +292 -0
  163. torchrl/data/replay_buffers/replay_buffers.py +2376 -0
  164. torchrl/data/replay_buffers/samplers.py +2578 -0
  165. torchrl/data/replay_buffers/scheduler.py +265 -0
  166. torchrl/data/replay_buffers/storages.py +2412 -0
  167. torchrl/data/replay_buffers/utils.py +1042 -0
  168. torchrl/data/replay_buffers/writers.py +781 -0
  169. torchrl/data/tensor_specs.py +7101 -0
  170. torchrl/data/utils.py +334 -0
  171. torchrl/envs/__init__.py +265 -0
  172. torchrl/envs/async_envs.py +1105 -0
  173. torchrl/envs/batched_envs.py +3093 -0
  174. torchrl/envs/common.py +4241 -0
  175. torchrl/envs/custom/__init__.py +11 -0
  176. torchrl/envs/custom/chess.py +617 -0
  177. torchrl/envs/custom/llm.py +214 -0
  178. torchrl/envs/custom/pendulum.py +401 -0
  179. torchrl/envs/custom/san_moves.txt +29274 -0
  180. torchrl/envs/custom/tictactoeenv.py +288 -0
  181. torchrl/envs/env_creator.py +263 -0
  182. torchrl/envs/gym_like.py +752 -0
  183. torchrl/envs/libs/__init__.py +68 -0
  184. torchrl/envs/libs/_gym_utils.py +326 -0
  185. torchrl/envs/libs/brax.py +846 -0
  186. torchrl/envs/libs/dm_control.py +544 -0
  187. torchrl/envs/libs/envpool.py +447 -0
  188. torchrl/envs/libs/gym.py +2239 -0
  189. torchrl/envs/libs/habitat.py +138 -0
  190. torchrl/envs/libs/isaac_lab.py +87 -0
  191. torchrl/envs/libs/isaacgym.py +203 -0
  192. torchrl/envs/libs/jax_utils.py +166 -0
  193. torchrl/envs/libs/jumanji.py +963 -0
  194. torchrl/envs/libs/meltingpot.py +599 -0
  195. torchrl/envs/libs/openml.py +153 -0
  196. torchrl/envs/libs/openspiel.py +652 -0
  197. torchrl/envs/libs/pettingzoo.py +1042 -0
  198. torchrl/envs/libs/procgen.py +351 -0
  199. torchrl/envs/libs/robohive.py +429 -0
  200. torchrl/envs/libs/smacv2.py +645 -0
  201. torchrl/envs/libs/unity_mlagents.py +891 -0
  202. torchrl/envs/libs/utils.py +147 -0
  203. torchrl/envs/libs/vmas.py +813 -0
  204. torchrl/envs/llm/__init__.py +63 -0
  205. torchrl/envs/llm/chat.py +730 -0
  206. torchrl/envs/llm/datasets/README.md +4 -0
  207. torchrl/envs/llm/datasets/__init__.py +17 -0
  208. torchrl/envs/llm/datasets/gsm8k.py +353 -0
  209. torchrl/envs/llm/datasets/ifeval.py +274 -0
  210. torchrl/envs/llm/envs.py +789 -0
  211. torchrl/envs/llm/libs/README.md +3 -0
  212. torchrl/envs/llm/libs/__init__.py +8 -0
  213. torchrl/envs/llm/libs/mlgym.py +869 -0
  214. torchrl/envs/llm/reward/__init__.py +10 -0
  215. torchrl/envs/llm/reward/gsm8k.py +324 -0
  216. torchrl/envs/llm/reward/ifeval/README.md +13 -0
  217. torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
  218. torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
  219. torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
  220. torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
  221. torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
  222. torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
  223. torchrl/envs/llm/transforms/__init__.py +55 -0
  224. torchrl/envs/llm/transforms/browser.py +292 -0
  225. torchrl/envs/llm/transforms/dataloading.py +859 -0
  226. torchrl/envs/llm/transforms/format.py +73 -0
  227. torchrl/envs/llm/transforms/kl.py +1544 -0
  228. torchrl/envs/llm/transforms/policy_version.py +189 -0
  229. torchrl/envs/llm/transforms/reason.py +323 -0
  230. torchrl/envs/llm/transforms/tokenizer.py +321 -0
  231. torchrl/envs/llm/transforms/tools.py +1955 -0
  232. torchrl/envs/model_based/__init__.py +9 -0
  233. torchrl/envs/model_based/common.py +180 -0
  234. torchrl/envs/model_based/dreamer.py +112 -0
  235. torchrl/envs/transforms/__init__.py +147 -0
  236. torchrl/envs/transforms/functional.py +48 -0
  237. torchrl/envs/transforms/gym_transforms.py +203 -0
  238. torchrl/envs/transforms/module.py +341 -0
  239. torchrl/envs/transforms/r3m.py +372 -0
  240. torchrl/envs/transforms/ray_service.py +663 -0
  241. torchrl/envs/transforms/rb_transforms.py +214 -0
  242. torchrl/envs/transforms/transforms.py +11835 -0
  243. torchrl/envs/transforms/utils.py +94 -0
  244. torchrl/envs/transforms/vc1.py +307 -0
  245. torchrl/envs/transforms/vecnorm.py +845 -0
  246. torchrl/envs/transforms/vip.py +407 -0
  247. torchrl/envs/utils.py +1718 -0
  248. torchrl/envs/vec_envs.py +11 -0
  249. torchrl/modules/__init__.py +206 -0
  250. torchrl/modules/distributions/__init__.py +73 -0
  251. torchrl/modules/distributions/continuous.py +830 -0
  252. torchrl/modules/distributions/discrete.py +908 -0
  253. torchrl/modules/distributions/truncated_normal.py +187 -0
  254. torchrl/modules/distributions/utils.py +233 -0
  255. torchrl/modules/llm/__init__.py +62 -0
  256. torchrl/modules/llm/backends/__init__.py +65 -0
  257. torchrl/modules/llm/backends/vllm/__init__.py +94 -0
  258. torchrl/modules/llm/backends/vllm/_models.py +46 -0
  259. torchrl/modules/llm/backends/vllm/base.py +72 -0
  260. torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
  261. torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
  262. torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
  263. torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
  264. torchrl/modules/llm/policies/__init__.py +28 -0
  265. torchrl/modules/llm/policies/common.py +1809 -0
  266. torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
  267. torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
  268. torchrl/modules/llm/utils.py +23 -0
  269. torchrl/modules/mcts/__init__.py +21 -0
  270. torchrl/modules/mcts/scores.py +579 -0
  271. torchrl/modules/models/__init__.py +86 -0
  272. torchrl/modules/models/batchrenorm.py +119 -0
  273. torchrl/modules/models/decision_transformer.py +179 -0
  274. torchrl/modules/models/exploration.py +731 -0
  275. torchrl/modules/models/llm.py +156 -0
  276. torchrl/modules/models/model_based.py +596 -0
  277. torchrl/modules/models/models.py +1712 -0
  278. torchrl/modules/models/multiagent.py +1067 -0
  279. torchrl/modules/models/recipes/impala.py +185 -0
  280. torchrl/modules/models/utils.py +162 -0
  281. torchrl/modules/planners/__init__.py +10 -0
  282. torchrl/modules/planners/cem.py +228 -0
  283. torchrl/modules/planners/common.py +73 -0
  284. torchrl/modules/planners/mppi.py +265 -0
  285. torchrl/modules/tensordict_module/__init__.py +89 -0
  286. torchrl/modules/tensordict_module/actors.py +2457 -0
  287. torchrl/modules/tensordict_module/common.py +529 -0
  288. torchrl/modules/tensordict_module/exploration.py +814 -0
  289. torchrl/modules/tensordict_module/probabilistic.py +321 -0
  290. torchrl/modules/tensordict_module/rnn.py +1639 -0
  291. torchrl/modules/tensordict_module/sequence.py +132 -0
  292. torchrl/modules/tensordict_module/world_models.py +34 -0
  293. torchrl/modules/utils/__init__.py +38 -0
  294. torchrl/modules/utils/mappings.py +9 -0
  295. torchrl/modules/utils/utils.py +89 -0
  296. torchrl/objectives/__init__.py +78 -0
  297. torchrl/objectives/a2c.py +659 -0
  298. torchrl/objectives/common.py +753 -0
  299. torchrl/objectives/cql.py +1346 -0
  300. torchrl/objectives/crossq.py +710 -0
  301. torchrl/objectives/ddpg.py +453 -0
  302. torchrl/objectives/decision_transformer.py +371 -0
  303. torchrl/objectives/deprecated.py +516 -0
  304. torchrl/objectives/dqn.py +683 -0
  305. torchrl/objectives/dreamer.py +488 -0
  306. torchrl/objectives/functional.py +48 -0
  307. torchrl/objectives/gail.py +258 -0
  308. torchrl/objectives/iql.py +996 -0
  309. torchrl/objectives/llm/__init__.py +30 -0
  310. torchrl/objectives/llm/grpo.py +846 -0
  311. torchrl/objectives/llm/sft.py +482 -0
  312. torchrl/objectives/multiagent/__init__.py +8 -0
  313. torchrl/objectives/multiagent/qmixer.py +396 -0
  314. torchrl/objectives/ppo.py +1669 -0
  315. torchrl/objectives/redq.py +683 -0
  316. torchrl/objectives/reinforce.py +530 -0
  317. torchrl/objectives/sac.py +1580 -0
  318. torchrl/objectives/td3.py +570 -0
  319. torchrl/objectives/td3_bc.py +625 -0
  320. torchrl/objectives/utils.py +782 -0
  321. torchrl/objectives/value/__init__.py +28 -0
  322. torchrl/objectives/value/advantages.py +1956 -0
  323. torchrl/objectives/value/functional.py +1459 -0
  324. torchrl/objectives/value/utils.py +360 -0
  325. torchrl/record/__init__.py +17 -0
  326. torchrl/record/loggers/__init__.py +23 -0
  327. torchrl/record/loggers/common.py +48 -0
  328. torchrl/record/loggers/csv.py +226 -0
  329. torchrl/record/loggers/mlflow.py +142 -0
  330. torchrl/record/loggers/tensorboard.py +139 -0
  331. torchrl/record/loggers/trackio.py +163 -0
  332. torchrl/record/loggers/utils.py +78 -0
  333. torchrl/record/loggers/wandb.py +214 -0
  334. torchrl/record/recorder.py +554 -0
  335. torchrl/services/__init__.py +79 -0
  336. torchrl/services/base.py +109 -0
  337. torchrl/services/ray_service.py +453 -0
  338. torchrl/testing/__init__.py +107 -0
  339. torchrl/testing/assertions.py +179 -0
  340. torchrl/testing/dist_utils.py +122 -0
  341. torchrl/testing/env_creators.py +227 -0
  342. torchrl/testing/env_helper.py +35 -0
  343. torchrl/testing/gym_helpers.py +156 -0
  344. torchrl/testing/llm_mocks.py +119 -0
  345. torchrl/testing/mocking_classes.py +2720 -0
  346. torchrl/testing/modules.py +295 -0
  347. torchrl/testing/mp_helpers.py +15 -0
  348. torchrl/testing/ray_helpers.py +293 -0
  349. torchrl/testing/utils.py +190 -0
  350. torchrl/trainers/__init__.py +42 -0
  351. torchrl/trainers/algorithms/__init__.py +11 -0
  352. torchrl/trainers/algorithms/configs/__init__.py +705 -0
  353. torchrl/trainers/algorithms/configs/collectors.py +216 -0
  354. torchrl/trainers/algorithms/configs/common.py +41 -0
  355. torchrl/trainers/algorithms/configs/data.py +308 -0
  356. torchrl/trainers/algorithms/configs/envs.py +104 -0
  357. torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
  358. torchrl/trainers/algorithms/configs/logging.py +80 -0
  359. torchrl/trainers/algorithms/configs/modules.py +570 -0
  360. torchrl/trainers/algorithms/configs/objectives.py +177 -0
  361. torchrl/trainers/algorithms/configs/trainers.py +340 -0
  362. torchrl/trainers/algorithms/configs/transforms.py +955 -0
  363. torchrl/trainers/algorithms/configs/utils.py +252 -0
  364. torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
  365. torchrl/trainers/algorithms/configs/weight_update.py +159 -0
  366. torchrl/trainers/algorithms/ppo.py +373 -0
  367. torchrl/trainers/algorithms/sac.py +308 -0
  368. torchrl/trainers/helpers/__init__.py +40 -0
  369. torchrl/trainers/helpers/collectors.py +416 -0
  370. torchrl/trainers/helpers/envs.py +573 -0
  371. torchrl/trainers/helpers/logger.py +33 -0
  372. torchrl/trainers/helpers/losses.py +132 -0
  373. torchrl/trainers/helpers/models.py +658 -0
  374. torchrl/trainers/helpers/replay_buffer.py +59 -0
  375. torchrl/trainers/helpers/trainers.py +301 -0
  376. torchrl/trainers/trainers.py +2052 -0
  377. torchrl/weight_update/__init__.py +33 -0
  378. torchrl/weight_update/_distributed.py +749 -0
  379. torchrl/weight_update/_mp.py +624 -0
  380. torchrl/weight_update/_noupdate.py +102 -0
  381. torchrl/weight_update/_ray.py +1032 -0
  382. torchrl/weight_update/_rpc.py +284 -0
  383. torchrl/weight_update/_shared.py +891 -0
  384. torchrl/weight_update/llm/__init__.py +32 -0
  385. torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
  386. torchrl/weight_update/llm/vllm_nccl.py +710 -0
  387. torchrl/weight_update/utils.py +73 -0
  388. torchrl/weight_update/weight_sync_schemes.py +1244 -0
  389. torchrl-0.11.0.dist-info/METADATA +1308 -0
  390. torchrl-0.11.0.dist-info/RECORD +394 -0
  391. torchrl-0.11.0.dist-info/WHEEL +5 -0
  392. torchrl-0.11.0.dist-info/entry_points.txt +2 -0
  393. torchrl-0.11.0.dist-info/licenses/LICENSE +21 -0
  394. torchrl-0.11.0.dist-info/top_level.txt +7 -0
@@ -0,0 +1,508 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from __future__ import annotations
7
+
8
+ import gc
9
+ import math
10
+ from functools import partial
11
+ from pathlib import Path
12
+
13
+ import hydra
14
+
15
+ from torchrl import merge_ray_runtime_env, torchrl_logger
16
+ from torchrl.data.llm.history import History
17
+ from torchrl.record.loggers.wandb import WandbLogger
18
+ from torchrl.weight_update.llm import get_model_metadata
19
+
20
+ try:
21
+ import ray
22
+ except ImportError:
23
+ raise ImportError(
24
+ "Ray is required for sync training. Please install ray with `pip install ray`."
25
+ )
26
+ import time
27
+
28
+ import torch
29
+ import tqdm
30
+
31
+ from ei_utils import (
32
+ compute_device_allocation,
33
+ create_cosine_scheduler_with_warmup,
34
+ get_inference_model,
35
+ get_train_model,
36
+ log_training_metrics,
37
+ make_env,
38
+ make_weight_sync_scheme,
39
+ RemoteDataLogger,
40
+ )
41
+ from omegaconf import DictConfig
42
+ from ray.util.queue import Queue
43
+
44
+ try:
45
+ from tensordict import set_list_to_stack
46
+ except ImportError:
47
+ raise ImportError(
48
+ "TensorDict is required. Please install it with `pip install tensordict`."
49
+ )
50
+ from torch.amp.autocast_mode import autocast
51
+ from torch.amp.grad_scaler import GradScaler
52
+ from torchrl._utils import timeit
53
+ from torchrl.collectors.llm import RayLLMCollector
54
+ from torchrl.data import LazyStackStorage, ReplayBuffer, SamplerWithoutReplacement
55
+ from torchrl.data.llm.topk import TopKRewardSelector
56
+ from torchrl.data.replay_buffers.ray_buffer import RayReplayBuffer
57
+ from torchrl.objectives.llm.sft import SFTLoss
58
+
59
+ DEFAULT_DIALOG_TURNS_PER_BATCH = 256
60
+
61
+
62
def setup_environment() -> None:
    """Setup required environment variables and configurations.

    Sets the default dtype/device for training and enables tensordict's
    list-to-stack behavior.

    Raises:
        RuntimeError: If CUDA is not available — training requires a GPU.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required for training")

    # Set default dtype to float32 for mixed precision training
    torch.set_default_dtype(torch.float32)
    torch.set_default_device("cuda:0")
    set_list_to_stack(True).set()

    # Pin CUDA operations to the first device. The availability check above
    # already guarantees CUDA is present, so no second guard is needed here.
    torch.cuda.set_device("cuda:0")
76
+
77
+
78
def train(
    replay_buffer: ReplayBuffer,
    cfg: DictConfig,
    collector: RayLLMCollector,
    devices: list[int] | None = None,
):
    """Main training loop for EI sync.

    This function implements synchronous training where data collection and optimization
    happen in separate, consecutive steps. The total number of steps is determined by the number of epochs,
    samples per epoch, and batches collected.

    Args:
        replay_buffer: The replay buffer to store experiences. The sampler will typically be a `SamplerWithoutReplacement`.
        cfg: The configuration object containing training parameters
        collector: The collector object.
        devices: The devices to use for the training model.

    Raises:
        RuntimeError: If the vLLM engine cannot be retrieved from the collector policy.
        ValueError: If the collector yields non-``None`` data (the collector is
            expected to write directly to the replay buffer).
    """
    # Setup training model and tokenizer
    policy_training, train_tokenizer = get_train_model(
        cfg, devices=devices, chat_template_name="qwen"
    )
    # NOTE(review): this indexes devices[0] — assumes `devices` is a non-empty
    # list here even though the signature allows None; verify against callers.
    train_device = devices[0]  # Use first device for batch processing

    # Setup loss function
    loss_fn = SFTLoss(
        actor_network=policy_training,
        kl_to_ref_coeff=cfg.train.kl_to_ref_coeff,
        tokenizer=train_tokenizer,
        tokenizer_kwargs={"chat_template_name": "qwen"},
        device=torch.device(f"cuda:{train_device}")
        if train_device is not None
        else None,
        loss_function=cfg.train.loss_function,
        beta=cfg.train.minor_sft_beta,
    )
    if cfg.model.compile:
        loss_fn = torch.compile(loss_fn)

    # Create weight sync scheme
    weight_sync_scheme = make_weight_sync_scheme(
        master_address="localhost",  # Since we're running locally
        master_port=None,  # Will auto-assign an open port
        vllm_tp_size=cfg.inference_model.num_devices
        if cfg.inference_model.num_devices is not None
        else len(cfg.inference_model.get("devices", [1])),
    )

    # Set up weight sender
    torchrl_logger.info("Setting up weight synchronization scheme...")
    sender = weight_sync_scheme.create_sender()
    sender.register_model(policy_training)

    # Get vLLM engine reference from collector's policy
    vllm_engine = collector.policy.model if hasattr(collector, "policy") else None
    if vllm_engine is None:
        raise RuntimeError("Could not get vLLM engine from collector policy")

    # Initialize collective group
    torchrl_logger.info("Initializing collective group...")
    metadata = get_model_metadata(policy_training)
    sender.init_all_workers_group(metadata, vllm_engine=vllm_engine)

    # First weight update
    with timeit("update_policy_weights"):
        sender.update_weights()
    timeit.print(prefix="First update_policy_weights_ time")
    timeit.reset()

    # Make optimizer
    torchrl_logger.info("Starting optimizer.")
    optimizer = torch.optim.Adam(
        policy_training.parameters(),
        lr=cfg.optimizer.lr,
        weight_decay=cfg.optimizer.weight_decay,
        fused=False,
    )
    # Single GradScaler shared across the whole run. It must NOT be re-created
    # per step: GradScaler keeps dynamic loss-scale state (growth/backoff
    # history) that only works if the same instance persists across iterations.
    scaler = GradScaler(enabled=cfg.train.mixed_precision)

    # Calculate total optimization steps for scheduler
    # The training loop structure: for each collector iteration, we do cfg.train.epochs epochs
    # Each epoch processes the entire replay buffer, and optimization happens every gradient_accumulation_steps
    # We need to estimate the total number of optimization steps
    # For now, we'll use a conservative estimate based on the total dialog turns
    # This can be refined based on the actual training dynamics
    total_optim_steps = (
        cfg.train.total_dialog_turns
        * cfg.train.epochs
        // cfg.train.gradient_accumulation_steps
    )

    # Create scheduler if enabled
    scheduler = None
    if cfg.optimizer.scheduler.enabled:
        warmup_steps = cfg.optimizer.scheduler.warmup_steps
        num_cycles = cfg.optimizer.scheduler.num_cycles
        torchrl_logger.info(
            f"Creating {cfg.optimizer.scheduler.type} scheduler with {warmup_steps} warmup steps out of {total_optim_steps} total steps"
        )

        scheduler = create_cosine_scheduler_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_optim_steps,
            num_cycles=num_cycles,
        )

    # Make checkpoint dir
    checkpoint_dir = Path(cfg.logging.checkpoint_dir)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Make wandb logger
    torchrl_logger.info("Starting wandb logger.")
    experiment_name = cfg.logging.experiment_name
    if experiment_name is not None:
        experiment_name = [experiment_name]
    else:
        experiment_name = []

    experiment_name.append(cfg.env.dataset)
    experiment_name.append(cfg.model.name)

    # Create local wandb logger for training metrics
    wandb_config = {
        "project": "ei-sync",
        "exp_name": "-".join(["ei-sync"] + experiment_name),
    }
    wandb_logger = WandbLogger(**wandb_config)

    # Pass the logging actor reference to the collector
    log_queue = Queue(maxsize=1000)
    collector.set_postproc(RemoteDataLogger(log_queue=log_queue))

    # Training loop
    torchrl_logger.info("Starting training loop.")
    pbar = tqdm.tqdm(total=cfg.train.total_dialog_turns)
    grad_norm = 0.0  # Initialize grad_norm
    data_read_count = 0

    global_step = 0
    optim_step = 0  # Track optimization steps separately for scheduler
    start_time = time.time()
    write_count = replay_buffer.write_count
    for data in collector:
        new_write_count = replay_buffer.write_count
        if new_write_count == write_count:
            torchrl_logger.warning("No new writes to replay buffer")
            continue
        pbar.update(new_write_count - write_count)
        write_count = new_write_count

        # data is None as the collector directly writes to the replay buffer
        if data is not None:
            raise ValueError("Data is not None")

        for _ in range(cfg.train.epochs):
            # Iterate over the replay buffer
            for batch in replay_buffer:
                batch = batch.to(train_device)
                global_step += 1
                pbar.set_description(
                    f"Gradient step {global_step}, writes: {replay_buffer.write_count}, batch size: {batch.shape}"
                )
                # For logging purposes, we get the last element of the history
                # and convert it to a string
                history: History = batch.view(-1)[0]["next", "history", "prompt"]
                history_str: list[str] | str = history.apply_chat_template(
                    tokenizer=train_tokenizer
                )
                while not isinstance(history_str, str):
                    history_str = "\n".join(history_str)

                data_read_count += batch.numel()

                with timeit("forward_pass"):
                    # Forward pass with mixed precision
                    with autocast("cuda", enabled=cfg.train.mixed_precision):
                        loss = loss_fn(batch)
                        if loss.loss_kl_to_ref is not None:
                            loss_val = loss.loss_sft + loss.loss_kl_to_ref
                        else:
                            loss_val = loss.loss_sft
                        loss_val = loss_val / cfg.train.gradient_accumulation_steps

                with timeit("backward_pass"):
                    # Backward pass. Reuse the run-wide `scaler` for float16
                    # mixed precision: re-creating a GradScaler here would
                    # reset its dynamic loss-scale state on every step.
                    if (
                        cfg.train.mixed_precision
                        and cfg.train_model.torch_dtype == "float16"
                    ):
                        scaler.scale(loss_val).backward()
                    else:
                        loss_val.backward()

                # Optimization step
                if ((global_step + 1) % cfg.train.gradient_accumulation_steps) == 0:
                    with timeit("optim_step"):
                        if (
                            cfg.train.mixed_precision
                            and cfg.train_model.torch_dtype == "float16"
                        ):
                            # Unscale before clipping so the norm is computed
                            # on true (unscaled) gradients.
                            scaler.unscale_(optimizer)

                        grad_norm = torch.nn.utils.clip_grad_norm_(
                            policy_training.parameters(),
                            cfg.optimizer.clip_grad_norm,
                        )

                        if (
                            cfg.train.mixed_precision
                            and cfg.train_model.torch_dtype == "float16"
                        ):
                            scaler.step(optimizer)
                            scaler.update()
                        else:
                            optimizer.step()
                        optimizer.zero_grad(set_to_none=True)

                        # Step the scheduler
                        if scheduler is not None:
                            scheduler.step()

                        # Increment optimization step counter
                        optim_step += 1

                # Clear memory
                del loss_val
                torch.cuda.empty_cache()
                gc.collect()

                # Update metrics
                if (global_step % cfg.train.logging_frequency) == 0:
                    log_training_metrics(
                        wandb_logger=wandb_logger,
                        replay_buffer=replay_buffer,
                        batch=batch,
                        loss=loss,
                        grad_norm=grad_norm,
                        global_step=global_step,
                        data_read_count=data_read_count,
                        collector=collector,
                        start_time=start_time,
                        gradient_accumulation_steps=cfg.train.gradient_accumulation_steps,
                        history_str=history_str,
                    )
                    # Log additional metrics
                    wandb_logger.log_scalar(
                        "learning_rate",
                        float(optimizer.param_groups[0]["lr"]),
                        step=global_step,
                    )
                    wandb_logger.log_scalar("optim_step", optim_step, step=global_step)
                    while not log_queue.empty():
                        logs = log_queue.get()
                        for k, v in logs.items():
                            wandb_logger.log_scalar(k, v, step=global_step)

                # Update policy weights
                if (
                    cfg.train.weight_update_frequency is not None
                    and (global_step + 1) % cfg.train.weight_update_frequency == 0
                ):
                    with timeit("update_policy_weights"):
                        torchrl_logger.info("Updating policy weights...")
                        sender.update_weights()
                    torch.cuda.empty_cache()
                    gc.collect()
                # Checkpointing disabled to prevent disk space issues
                # if (global_step + 1) % cfg.train.checkpoint_frequency == 0:
                #     with timeit("save_checkpoint"):
                #         torchrl_logger.info(
                #             f"Saving checkpoint {(global_step+1) // cfg.train.checkpoint_frequency}..."
                #         )
                #         checkpoint = {
                #             "step": global_step,
                #             "model_state_dict": policy_training.model.state_dict(),
                #             "optimizer_state_dict": optimizer.state_dict(),
                #             "scaler_state_dict": scaler.state_dict(),
                #             "config": dict(cfg),
                #         }
                #         torch.save(checkpoint, checkpoint_dir / f"checkpoint_{global_step:04d}.pt")

        # Update policy weights
        if cfg.train.weight_update_frequency is None:
            # If weight_update_frequency is not set, we update the weights after each batch
            with timeit("update_policy_weights"):
                torchrl_logger.info("Updating policy weights...")
                sender.update_weights()
            torch.cuda.empty_cache()
            gc.collect()

        timeit.print(prefix="timeit")
        for key, val in timeit.todict().items():
            wandb_logger.log_scalar(f"timeit/{key}", val)
        timeit.reset()

        if cfg.train.empty_replay_buffer:
            replay_buffer.empty(empty_write_count=False)

    pbar.close()
    collector.shutdown()
380
+
381
+
382
@hydra.main(version_base=None, config_path="config", config_name="ei_gsm8k")
def main(cfg):
    """Sync expert-iteration entry point.

    Wires up the Ray actors — replay buffer, LLM collector, and a remote
    training handler — from the Hydra config, then blocks until training
    completes.

    Args:
        cfg: Hydra/OmegaConf configuration (``config/ei_gsm8k``).

    Raises:
        ValueError: if run with ``train.sync=False`` or any model's
            ``num_devices`` is unset.
    """
    # This script only supports the synchronous loop; async has its own file.
    if not cfg.train.sync:
        raise ValueError(
            "expert-iteration-sync.py must run in sync mode (`python expert-iteration-sync.py mode=sync`). Please use expert-iteration-async.py for async mode (`python expert-iteration-async.py mode=async`)."
        )

    # Decide which devices each model (and Ray itself) gets.
    devices = compute_device_allocation(cfg)

    if not ray.is_initialized():
        # Flatten the OmegaConf init section into plain dicts, dropping
        # private ("_"-prefixed) keys that ray.init does not accept.
        ray_init_config = {}
        for key, value in dict(cfg.ray.init_config).items():
            if key.startswith("_"):
                continue
            ray_init_config[key] = (
                dict(value) if isinstance(value, DictConfig) else value
            )
        # Add the computed GPU budget and merge with the default runtime_env.
        ray_init_config["num_gpus"] = devices["ray_num_gpus"]
        ray_init_config = merge_ray_runtime_env(ray_init_config)
        torchrl_logger.info(f"Ray init config: {ray_init_config=}")
        ray.init(**ray_init_config)

    # Fail fast if any model's device count was left unset.
    if cfg.inference_model.num_devices is None:
        raise ValueError(
            "Inference model num_devices must be set via inference_model.num_devices"
        )
    if cfg.ref_model.num_devices is None:
        raise ValueError("Ref model num_devices must be set via ref_model.num_devices")
    if cfg.train_model.num_devices is None:
        raise ValueError(
            "Train model num_devices must be set via train_model.num_devices"
        )

    # Materialize the per-actor Ray configs as plain dicts.
    replay_buffer_config = dict(cfg.ray.replay_buffer_config)
    collector_config = dict(cfg.ray.collector_config)
    train_handler_config = dict(cfg.ray.train_handler_config)

    inference_policy = get_inference_model(
        cfg, devices=devices["inference_model_devices"]
    )
    torchrl_logger.info(f"Inference policy: {inference_policy}")

    torchrl_logger.info(f"Starting replay buffer with {replay_buffer_config=}")
    # Buffer capacity: explicit config wins; otherwise derive it from the
    # top-k selection math, or use a large sentinel when the buffer is
    # emptied after every batch anyway.
    capacity = cfg.train.buffer_size
    if capacity is None:
        if cfg.train.empty_replay_buffer:
            # we can just set a big number, the buffer will be emptied anyway
            capacity = 1000000
        else:
            turns_per_batch = cfg.train.dialog_turns_per_batch
            if turns_per_batch is None:
                turns_per_batch = DEFAULT_DIALOG_TURNS_PER_BATCH
            capacity = int(
                math.ceil(turns_per_batch * cfg.train.topk_size / cfg.env.repeats)
            )
    rb = RayReplayBuffer(
        storage=partial(
            LazyStackStorage,
            capacity,
            device="cpu",
        ),
        sampler=SamplerWithoutReplacement,
        transform_factory=partial(
            TopKRewardSelector,
            total_dialog_turns=cfg.env.repeats,
            topk_size=cfg.train.topk_size,
        ),
        batch_size=cfg.train.optim_batch_size,
        remote_config=replay_buffer_config,
    )
    torchrl_logger.info(f"Replay buffer: {rb}")

    # The ref model is instantiated inside the collector, so the collector
    # actor only needs the ref model's GPU allocation.
    collector_config["num_gpus"] = cfg.ref_model.num_devices
    torchrl_logger.info(f"Starting collector with {collector_config=}")

    turns_per_batch = cfg.train.dialog_turns_per_batch
    if turns_per_batch is None:
        # Hardcoded default for now.
        turns_per_batch = DEFAULT_DIALOG_TURNS_PER_BATCH

    collector = RayLLMCollector(
        env=partial(make_env, cfg, devices=devices["ref_model_devices"]),
        policy=inference_policy,
        dialog_turns_per_batch=turns_per_batch,
        total_dialog_turns=cfg.train.total_dialog_turns,
        replay_buffer=rb,
        ray_init_config=None,  # Ray is already initialized
        weight_updater=None,  # created later, once the remote LLM exists
        track_policy_version=True,
        remote_config=collector_config,
        sync_iter=cfg.train.sync_iter,
        verbose=True,
    )
    # Block until the remote collector actor finishes constructing.
    ray.get(collector._collector.is_initialized.remote())
    torchrl_logger.info(f"Collector: {collector}")

    # Keep only the resources the training actor needs from the raw config.
    train_handler_config = {
        "num_cpus": train_handler_config.get("num_cpus", 1),
        "num_gpus": cfg.train_model.num_devices,
    }
    torchrl_logger.info(f"Starting training handler with {train_handler_config=}")
    train_handler = ray.remote(
        **train_handler_config,
    )(train)

    # Launch training remotely and wait for it to finish.
    ray.get(
        train_handler.remote(rb, cfg, collector, devices["train_model_devices"])
    )
503
+
504
+
505
# Script entry point: prepare the process (setup_environment — defined
# elsewhere; presumably env vars / seeding, confirm) before Hydra parses
# the CLI and invokes main().
if __name__ == "__main__":
    # Setup environment
    setup_environment()
    main()
@@ -0,0 +1,13 @@
1
+ torch==2.7.0
2
+ transformers==4.52.4
3
+ peft==0.15.2
4
+ bitsandbytes==0.46.0
5
+ datasets==3.6.0
6
+ wandb==0.19.11
7
+ hydra-core==1.3.2
8
+ ray==2.52.1
9
+ tqdm==4.67.1
10
+ tensordict==0.9.0
11
+ vllm==0.9.0.1
12
+ accelerate==1.7.0
13
+ xformers==0.0.30
@@ -0,0 +1,16 @@
1
+ torch==2.7.0
2
+ transformers==4.52.4
3
+ peft==0.15.2
4
+ bitsandbytes==0.46.0
5
+ datasets==3.6.0
6
+ wandb==0.19.11
7
+ hydra-core==1.3.2
8
+ ray==2.52.1
9
+ tqdm==4.67.1
10
+ tensordict==0.9.0
11
+ vllm==0.9.0.1
12
+ accelerate==1.7.0
13
+ xformers==0.0.30
14
+ nltk==3.9.1
15
+ langdetect==1.0.9
16
+ immutabledict==4.2.1