torchrl-0.11.0-cp314-cp314t-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395)
  1. benchmarks/benchmark_batched_envs.py +104 -0
  2. benchmarks/conftest.py +91 -0
  3. benchmarks/ecosystem/gym_env_throughput.py +321 -0
  4. benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
  5. benchmarks/requirements.txt +7 -0
  6. benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
  7. benchmarks/test_collectors_benchmark.py +240 -0
  8. benchmarks/test_compressed_storage_benchmark.py +145 -0
  9. benchmarks/test_envs_benchmark.py +133 -0
  10. benchmarks/test_llm.py +101 -0
  11. benchmarks/test_non_tensor_env_benchmark.py +70 -0
  12. benchmarks/test_objectives_benchmarks.py +1199 -0
  13. benchmarks/test_replaybuffer_benchmark.py +254 -0
  14. sota-check/README.md +35 -0
  15. sota-implementations/README.md +142 -0
  16. sota-implementations/a2c/README.md +39 -0
  17. sota-implementations/a2c/a2c_atari.py +291 -0
  18. sota-implementations/a2c/a2c_mujoco.py +273 -0
  19. sota-implementations/a2c/utils_atari.py +240 -0
  20. sota-implementations/a2c/utils_mujoco.py +160 -0
  21. sota-implementations/bandits/README.md +7 -0
  22. sota-implementations/bandits/dqn.py +126 -0
  23. sota-implementations/cql/cql_offline.py +198 -0
  24. sota-implementations/cql/cql_online.py +249 -0
  25. sota-implementations/cql/discrete_cql_offline.py +180 -0
  26. sota-implementations/cql/discrete_cql_online.py +227 -0
  27. sota-implementations/cql/utils.py +471 -0
  28. sota-implementations/crossq/crossq.py +271 -0
  29. sota-implementations/crossq/utils.py +320 -0
  30. sota-implementations/ddpg/ddpg.py +231 -0
  31. sota-implementations/ddpg/utils.py +325 -0
  32. sota-implementations/decision_transformer/dt.py +163 -0
  33. sota-implementations/decision_transformer/lamb.py +167 -0
  34. sota-implementations/decision_transformer/online_dt.py +178 -0
  35. sota-implementations/decision_transformer/utils.py +562 -0
  36. sota-implementations/discrete_sac/discrete_sac.py +243 -0
  37. sota-implementations/discrete_sac/utils.py +324 -0
  38. sota-implementations/dqn/README.md +30 -0
  39. sota-implementations/dqn/dqn_atari.py +272 -0
  40. sota-implementations/dqn/dqn_cartpole.py +236 -0
  41. sota-implementations/dqn/utils_atari.py +132 -0
  42. sota-implementations/dqn/utils_cartpole.py +90 -0
  43. sota-implementations/dreamer/README.md +129 -0
  44. sota-implementations/dreamer/dreamer.py +586 -0
  45. sota-implementations/dreamer/dreamer_utils.py +1107 -0
  46. sota-implementations/expert-iteration/README.md +352 -0
  47. sota-implementations/expert-iteration/ei_utils.py +770 -0
  48. sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
  49. sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
  50. sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
  51. sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
  52. sota-implementations/gail/gail.py +327 -0
  53. sota-implementations/gail/gail_utils.py +68 -0
  54. sota-implementations/gail/ppo_utils.py +157 -0
  55. sota-implementations/grpo/README.md +273 -0
  56. sota-implementations/grpo/grpo-async.py +437 -0
  57. sota-implementations/grpo/grpo-sync.py +435 -0
  58. sota-implementations/grpo/grpo_utils.py +843 -0
  59. sota-implementations/grpo/requirements_gsm8k.txt +11 -0
  60. sota-implementations/grpo/requirements_ifeval.txt +16 -0
  61. sota-implementations/impala/README.md +33 -0
  62. sota-implementations/impala/impala_multi_node_ray.py +292 -0
  63. sota-implementations/impala/impala_multi_node_submitit.py +284 -0
  64. sota-implementations/impala/impala_single_node.py +261 -0
  65. sota-implementations/impala/utils.py +184 -0
  66. sota-implementations/iql/discrete_iql.py +230 -0
  67. sota-implementations/iql/iql_offline.py +164 -0
  68. sota-implementations/iql/iql_online.py +225 -0
  69. sota-implementations/iql/utils.py +437 -0
  70. sota-implementations/multiagent/README.md +74 -0
  71. sota-implementations/multiagent/iql.py +237 -0
  72. sota-implementations/multiagent/maddpg_iddpg.py +266 -0
  73. sota-implementations/multiagent/mappo_ippo.py +267 -0
  74. sota-implementations/multiagent/qmix_vdn.py +271 -0
  75. sota-implementations/multiagent/sac.py +337 -0
  76. sota-implementations/multiagent/utils/__init__.py +4 -0
  77. sota-implementations/multiagent/utils/logging.py +151 -0
  78. sota-implementations/multiagent/utils/utils.py +43 -0
  79. sota-implementations/ppo/README.md +29 -0
  80. sota-implementations/ppo/ppo_atari.py +305 -0
  81. sota-implementations/ppo/ppo_mujoco.py +293 -0
  82. sota-implementations/ppo/utils_atari.py +238 -0
  83. sota-implementations/ppo/utils_mujoco.py +152 -0
  84. sota-implementations/ppo_trainer/train.py +21 -0
  85. sota-implementations/redq/README.md +7 -0
  86. sota-implementations/redq/redq.py +199 -0
  87. sota-implementations/redq/utils.py +1060 -0
  88. sota-implementations/sac/sac-async.py +266 -0
  89. sota-implementations/sac/sac.py +239 -0
  90. sota-implementations/sac/utils.py +381 -0
  91. sota-implementations/sac_trainer/train.py +16 -0
  92. sota-implementations/td3/td3.py +254 -0
  93. sota-implementations/td3/utils.py +319 -0
  94. sota-implementations/td3_bc/td3_bc.py +177 -0
  95. sota-implementations/td3_bc/utils.py +251 -0
  96. torchrl/.dylibs/libc++.1.0.dylib +0 -0
  97. torchrl/__init__.py +144 -0
  98. torchrl/_extension.py +74 -0
  99. torchrl/_torchrl.cpython-314t-darwin.so +0 -0
  100. torchrl/_utils.py +1431 -0
  101. torchrl/collectors/__init__.py +48 -0
  102. torchrl/collectors/_base.py +1058 -0
  103. torchrl/collectors/_constants.py +88 -0
  104. torchrl/collectors/_multi_async.py +324 -0
  105. torchrl/collectors/_multi_base.py +1805 -0
  106. torchrl/collectors/_multi_sync.py +464 -0
  107. torchrl/collectors/_runner.py +581 -0
  108. torchrl/collectors/_single.py +2009 -0
  109. torchrl/collectors/_single_async.py +259 -0
  110. torchrl/collectors/collectors.py +62 -0
  111. torchrl/collectors/distributed/__init__.py +32 -0
  112. torchrl/collectors/distributed/default_configs.py +133 -0
  113. torchrl/collectors/distributed/generic.py +1306 -0
  114. torchrl/collectors/distributed/ray.py +1092 -0
  115. torchrl/collectors/distributed/rpc.py +1006 -0
  116. torchrl/collectors/distributed/sync.py +731 -0
  117. torchrl/collectors/distributed/utils.py +160 -0
  118. torchrl/collectors/llm/__init__.py +10 -0
  119. torchrl/collectors/llm/base.py +494 -0
  120. torchrl/collectors/llm/ray_collector.py +275 -0
  121. torchrl/collectors/llm/utils.py +36 -0
  122. torchrl/collectors/llm/weight_update/__init__.py +10 -0
  123. torchrl/collectors/llm/weight_update/vllm.py +348 -0
  124. torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
  125. torchrl/collectors/utils.py +433 -0
  126. torchrl/collectors/weight_update.py +591 -0
  127. torchrl/csrc/numpy_utils.h +38 -0
  128. torchrl/csrc/pybind.cpp +27 -0
  129. torchrl/csrc/segment_tree.h +458 -0
  130. torchrl/csrc/torch_utils.h +34 -0
  131. torchrl/csrc/utils.cpp +48 -0
  132. torchrl/csrc/utils.h +31 -0
  133. torchrl/data/__init__.py +187 -0
  134. torchrl/data/datasets/__init__.py +58 -0
  135. torchrl/data/datasets/atari_dqn.py +878 -0
  136. torchrl/data/datasets/common.py +281 -0
  137. torchrl/data/datasets/d4rl.py +489 -0
  138. torchrl/data/datasets/d4rl_infos.py +187 -0
  139. torchrl/data/datasets/gen_dgrl.py +375 -0
  140. torchrl/data/datasets/minari_data.py +643 -0
  141. torchrl/data/datasets/openml.py +177 -0
  142. torchrl/data/datasets/openx.py +798 -0
  143. torchrl/data/datasets/roboset.py +363 -0
  144. torchrl/data/datasets/utils.py +11 -0
  145. torchrl/data/datasets/vd4rl.py +432 -0
  146. torchrl/data/llm/__init__.py +34 -0
  147. torchrl/data/llm/dataset.py +491 -0
  148. torchrl/data/llm/history.py +1378 -0
  149. torchrl/data/llm/prompt.py +198 -0
  150. torchrl/data/llm/reward.py +225 -0
  151. torchrl/data/llm/topk.py +186 -0
  152. torchrl/data/llm/utils.py +543 -0
  153. torchrl/data/map/__init__.py +21 -0
  154. torchrl/data/map/hash.py +185 -0
  155. torchrl/data/map/query.py +204 -0
  156. torchrl/data/map/tdstorage.py +363 -0
  157. torchrl/data/map/tree.py +1434 -0
  158. torchrl/data/map/utils.py +103 -0
  159. torchrl/data/postprocs/__init__.py +8 -0
  160. torchrl/data/postprocs/postprocs.py +391 -0
  161. torchrl/data/replay_buffers/__init__.py +99 -0
  162. torchrl/data/replay_buffers/checkpointers.py +622 -0
  163. torchrl/data/replay_buffers/ray_buffer.py +292 -0
  164. torchrl/data/replay_buffers/replay_buffers.py +2376 -0
  165. torchrl/data/replay_buffers/samplers.py +2578 -0
  166. torchrl/data/replay_buffers/scheduler.py +265 -0
  167. torchrl/data/replay_buffers/storages.py +2412 -0
  168. torchrl/data/replay_buffers/utils.py +1042 -0
  169. torchrl/data/replay_buffers/writers.py +781 -0
  170. torchrl/data/tensor_specs.py +7101 -0
  171. torchrl/data/utils.py +334 -0
  172. torchrl/envs/__init__.py +265 -0
  173. torchrl/envs/async_envs.py +1105 -0
  174. torchrl/envs/batched_envs.py +3093 -0
  175. torchrl/envs/common.py +4241 -0
  176. torchrl/envs/custom/__init__.py +11 -0
  177. torchrl/envs/custom/chess.py +617 -0
  178. torchrl/envs/custom/llm.py +214 -0
  179. torchrl/envs/custom/pendulum.py +401 -0
  180. torchrl/envs/custom/san_moves.txt +29274 -0
  181. torchrl/envs/custom/tictactoeenv.py +288 -0
  182. torchrl/envs/env_creator.py +263 -0
  183. torchrl/envs/gym_like.py +752 -0
  184. torchrl/envs/libs/__init__.py +68 -0
  185. torchrl/envs/libs/_gym_utils.py +326 -0
  186. torchrl/envs/libs/brax.py +846 -0
  187. torchrl/envs/libs/dm_control.py +544 -0
  188. torchrl/envs/libs/envpool.py +447 -0
  189. torchrl/envs/libs/gym.py +2239 -0
  190. torchrl/envs/libs/habitat.py +138 -0
  191. torchrl/envs/libs/isaac_lab.py +87 -0
  192. torchrl/envs/libs/isaacgym.py +203 -0
  193. torchrl/envs/libs/jax_utils.py +166 -0
  194. torchrl/envs/libs/jumanji.py +963 -0
  195. torchrl/envs/libs/meltingpot.py +599 -0
  196. torchrl/envs/libs/openml.py +153 -0
  197. torchrl/envs/libs/openspiel.py +652 -0
  198. torchrl/envs/libs/pettingzoo.py +1042 -0
  199. torchrl/envs/libs/procgen.py +351 -0
  200. torchrl/envs/libs/robohive.py +429 -0
  201. torchrl/envs/libs/smacv2.py +645 -0
  202. torchrl/envs/libs/unity_mlagents.py +891 -0
  203. torchrl/envs/libs/utils.py +147 -0
  204. torchrl/envs/libs/vmas.py +813 -0
  205. torchrl/envs/llm/__init__.py +63 -0
  206. torchrl/envs/llm/chat.py +730 -0
  207. torchrl/envs/llm/datasets/README.md +4 -0
  208. torchrl/envs/llm/datasets/__init__.py +17 -0
  209. torchrl/envs/llm/datasets/gsm8k.py +353 -0
  210. torchrl/envs/llm/datasets/ifeval.py +274 -0
  211. torchrl/envs/llm/envs.py +789 -0
  212. torchrl/envs/llm/libs/README.md +3 -0
  213. torchrl/envs/llm/libs/__init__.py +8 -0
  214. torchrl/envs/llm/libs/mlgym.py +869 -0
  215. torchrl/envs/llm/reward/__init__.py +10 -0
  216. torchrl/envs/llm/reward/gsm8k.py +324 -0
  217. torchrl/envs/llm/reward/ifeval/README.md +13 -0
  218. torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
  219. torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
  220. torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
  221. torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
  222. torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
  223. torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
  224. torchrl/envs/llm/transforms/__init__.py +55 -0
  225. torchrl/envs/llm/transforms/browser.py +292 -0
  226. torchrl/envs/llm/transforms/dataloading.py +859 -0
  227. torchrl/envs/llm/transforms/format.py +73 -0
  228. torchrl/envs/llm/transforms/kl.py +1544 -0
  229. torchrl/envs/llm/transforms/policy_version.py +189 -0
  230. torchrl/envs/llm/transforms/reason.py +323 -0
  231. torchrl/envs/llm/transforms/tokenizer.py +321 -0
  232. torchrl/envs/llm/transforms/tools.py +1955 -0
  233. torchrl/envs/model_based/__init__.py +9 -0
  234. torchrl/envs/model_based/common.py +180 -0
  235. torchrl/envs/model_based/dreamer.py +112 -0
  236. torchrl/envs/transforms/__init__.py +147 -0
  237. torchrl/envs/transforms/functional.py +48 -0
  238. torchrl/envs/transforms/gym_transforms.py +203 -0
  239. torchrl/envs/transforms/module.py +341 -0
  240. torchrl/envs/transforms/r3m.py +372 -0
  241. torchrl/envs/transforms/ray_service.py +663 -0
  242. torchrl/envs/transforms/rb_transforms.py +214 -0
  243. torchrl/envs/transforms/transforms.py +11835 -0
  244. torchrl/envs/transforms/utils.py +94 -0
  245. torchrl/envs/transforms/vc1.py +307 -0
  246. torchrl/envs/transforms/vecnorm.py +845 -0
  247. torchrl/envs/transforms/vip.py +407 -0
  248. torchrl/envs/utils.py +1718 -0
  249. torchrl/envs/vec_envs.py +11 -0
  250. torchrl/modules/__init__.py +206 -0
  251. torchrl/modules/distributions/__init__.py +73 -0
  252. torchrl/modules/distributions/continuous.py +830 -0
  253. torchrl/modules/distributions/discrete.py +908 -0
  254. torchrl/modules/distributions/truncated_normal.py +187 -0
  255. torchrl/modules/distributions/utils.py +233 -0
  256. torchrl/modules/llm/__init__.py +62 -0
  257. torchrl/modules/llm/backends/__init__.py +65 -0
  258. torchrl/modules/llm/backends/vllm/__init__.py +94 -0
  259. torchrl/modules/llm/backends/vllm/_models.py +46 -0
  260. torchrl/modules/llm/backends/vllm/base.py +72 -0
  261. torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
  262. torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
  263. torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
  264. torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
  265. torchrl/modules/llm/policies/__init__.py +28 -0
  266. torchrl/modules/llm/policies/common.py +1809 -0
  267. torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
  268. torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
  269. torchrl/modules/llm/utils.py +23 -0
  270. torchrl/modules/mcts/__init__.py +21 -0
  271. torchrl/modules/mcts/scores.py +579 -0
  272. torchrl/modules/models/__init__.py +86 -0
  273. torchrl/modules/models/batchrenorm.py +119 -0
  274. torchrl/modules/models/decision_transformer.py +179 -0
  275. torchrl/modules/models/exploration.py +731 -0
  276. torchrl/modules/models/llm.py +156 -0
  277. torchrl/modules/models/model_based.py +596 -0
  278. torchrl/modules/models/models.py +1712 -0
  279. torchrl/modules/models/multiagent.py +1067 -0
  280. torchrl/modules/models/recipes/impala.py +185 -0
  281. torchrl/modules/models/utils.py +162 -0
  282. torchrl/modules/planners/__init__.py +10 -0
  283. torchrl/modules/planners/cem.py +228 -0
  284. torchrl/modules/planners/common.py +73 -0
  285. torchrl/modules/planners/mppi.py +265 -0
  286. torchrl/modules/tensordict_module/__init__.py +89 -0
  287. torchrl/modules/tensordict_module/actors.py +2457 -0
  288. torchrl/modules/tensordict_module/common.py +529 -0
  289. torchrl/modules/tensordict_module/exploration.py +814 -0
  290. torchrl/modules/tensordict_module/probabilistic.py +321 -0
  291. torchrl/modules/tensordict_module/rnn.py +1639 -0
  292. torchrl/modules/tensordict_module/sequence.py +132 -0
  293. torchrl/modules/tensordict_module/world_models.py +34 -0
  294. torchrl/modules/utils/__init__.py +38 -0
  295. torchrl/modules/utils/mappings.py +9 -0
  296. torchrl/modules/utils/utils.py +89 -0
  297. torchrl/objectives/__init__.py +78 -0
  298. torchrl/objectives/a2c.py +659 -0
  299. torchrl/objectives/common.py +753 -0
  300. torchrl/objectives/cql.py +1346 -0
  301. torchrl/objectives/crossq.py +710 -0
  302. torchrl/objectives/ddpg.py +453 -0
  303. torchrl/objectives/decision_transformer.py +371 -0
  304. torchrl/objectives/deprecated.py +516 -0
  305. torchrl/objectives/dqn.py +683 -0
  306. torchrl/objectives/dreamer.py +488 -0
  307. torchrl/objectives/functional.py +48 -0
  308. torchrl/objectives/gail.py +258 -0
  309. torchrl/objectives/iql.py +996 -0
  310. torchrl/objectives/llm/__init__.py +30 -0
  311. torchrl/objectives/llm/grpo.py +846 -0
  312. torchrl/objectives/llm/sft.py +482 -0
  313. torchrl/objectives/multiagent/__init__.py +8 -0
  314. torchrl/objectives/multiagent/qmixer.py +396 -0
  315. torchrl/objectives/ppo.py +1669 -0
  316. torchrl/objectives/redq.py +683 -0
  317. torchrl/objectives/reinforce.py +530 -0
  318. torchrl/objectives/sac.py +1580 -0
  319. torchrl/objectives/td3.py +570 -0
  320. torchrl/objectives/td3_bc.py +625 -0
  321. torchrl/objectives/utils.py +782 -0
  322. torchrl/objectives/value/__init__.py +28 -0
  323. torchrl/objectives/value/advantages.py +1956 -0
  324. torchrl/objectives/value/functional.py +1459 -0
  325. torchrl/objectives/value/utils.py +360 -0
  326. torchrl/record/__init__.py +17 -0
  327. torchrl/record/loggers/__init__.py +23 -0
  328. torchrl/record/loggers/common.py +48 -0
  329. torchrl/record/loggers/csv.py +226 -0
  330. torchrl/record/loggers/mlflow.py +142 -0
  331. torchrl/record/loggers/tensorboard.py +139 -0
  332. torchrl/record/loggers/trackio.py +163 -0
  333. torchrl/record/loggers/utils.py +78 -0
  334. torchrl/record/loggers/wandb.py +214 -0
  335. torchrl/record/recorder.py +554 -0
  336. torchrl/services/__init__.py +79 -0
  337. torchrl/services/base.py +109 -0
  338. torchrl/services/ray_service.py +453 -0
  339. torchrl/testing/__init__.py +107 -0
  340. torchrl/testing/assertions.py +179 -0
  341. torchrl/testing/dist_utils.py +122 -0
  342. torchrl/testing/env_creators.py +227 -0
  343. torchrl/testing/env_helper.py +35 -0
  344. torchrl/testing/gym_helpers.py +156 -0
  345. torchrl/testing/llm_mocks.py +119 -0
  346. torchrl/testing/mocking_classes.py +2720 -0
  347. torchrl/testing/modules.py +295 -0
  348. torchrl/testing/mp_helpers.py +15 -0
  349. torchrl/testing/ray_helpers.py +293 -0
  350. torchrl/testing/utils.py +190 -0
  351. torchrl/trainers/__init__.py +42 -0
  352. torchrl/trainers/algorithms/__init__.py +11 -0
  353. torchrl/trainers/algorithms/configs/__init__.py +705 -0
  354. torchrl/trainers/algorithms/configs/collectors.py +216 -0
  355. torchrl/trainers/algorithms/configs/common.py +41 -0
  356. torchrl/trainers/algorithms/configs/data.py +308 -0
  357. torchrl/trainers/algorithms/configs/envs.py +104 -0
  358. torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
  359. torchrl/trainers/algorithms/configs/logging.py +80 -0
  360. torchrl/trainers/algorithms/configs/modules.py +570 -0
  361. torchrl/trainers/algorithms/configs/objectives.py +177 -0
  362. torchrl/trainers/algorithms/configs/trainers.py +340 -0
  363. torchrl/trainers/algorithms/configs/transforms.py +955 -0
  364. torchrl/trainers/algorithms/configs/utils.py +252 -0
  365. torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
  366. torchrl/trainers/algorithms/configs/weight_update.py +159 -0
  367. torchrl/trainers/algorithms/ppo.py +373 -0
  368. torchrl/trainers/algorithms/sac.py +308 -0
  369. torchrl/trainers/helpers/__init__.py +40 -0
  370. torchrl/trainers/helpers/collectors.py +416 -0
  371. torchrl/trainers/helpers/envs.py +573 -0
  372. torchrl/trainers/helpers/logger.py +33 -0
  373. torchrl/trainers/helpers/losses.py +132 -0
  374. torchrl/trainers/helpers/models.py +658 -0
  375. torchrl/trainers/helpers/replay_buffer.py +59 -0
  376. torchrl/trainers/helpers/trainers.py +301 -0
  377. torchrl/trainers/trainers.py +2052 -0
  378. torchrl/weight_update/__init__.py +33 -0
  379. torchrl/weight_update/_distributed.py +749 -0
  380. torchrl/weight_update/_mp.py +624 -0
  381. torchrl/weight_update/_noupdate.py +102 -0
  382. torchrl/weight_update/_ray.py +1032 -0
  383. torchrl/weight_update/_rpc.py +284 -0
  384. torchrl/weight_update/_shared.py +891 -0
  385. torchrl/weight_update/llm/__init__.py +32 -0
  386. torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
  387. torchrl/weight_update/llm/vllm_nccl.py +710 -0
  388. torchrl/weight_update/utils.py +73 -0
  389. torchrl/weight_update/weight_sync_schemes.py +1244 -0
  390. torchrl-0.11.0.dist-info/METADATA +1308 -0
  391. torchrl-0.11.0.dist-info/RECORD +395 -0
  392. torchrl-0.11.0.dist-info/WHEEL +5 -0
  393. torchrl-0.11.0.dist-info/entry_points.txt +2 -0
  394. torchrl-0.11.0.dist-info/licenses/LICENSE +21 -0
  395. torchrl-0.11.0.dist-info/top_level.txt +7 -0
sota-implementations/sac/sac-async.py

@@ -0,0 +1,266 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Async SAC Example.
+
+WARNING: This isn't a SOTA implementation but a rudimentary implementation of SAC where inference
+and training are entirely decoupled. It can achieve a 20x speedup if compile and cudagraph are used.
+Two GPUs are required for this script to run.
+The API is currently being perfected, and contributions are welcome (as usual!) - see the TODOs in this script.
+
+This is a simple self-contained example of a SAC training script.
+
+It supports state environments like MuJoCo.
+
+The helper functions are coded in the utils.py associated with this script.
+"""
+from __future__ import annotations
+
+import time
+
+import warnings
+from functools import partial
+
+import hydra
+import numpy as np
+import tensordict
+import torch
+import torch.cuda
+import tqdm
+from tensordict import TensorDict
+from tensordict.nn import CudaGraphModule
+from torchrl._utils import (
+    compile_with_warmup,
+    get_available_device,
+    logger as torchrl_logger,
+    timeit,
+)
+from torchrl.envs.utils import ExplorationType, set_exploration_type
+from torchrl.objectives import group_optimizers
+from torchrl.record.loggers import generate_exp_name, get_logger
+from utils import (
+    dump_video,
+    log_metrics,
+    make_collector_async,
+    make_environment,
+    make_loss_module,
+    make_replay_buffer,
+    make_sac_agent,
+    make_sac_optimizer,
+    make_train_environment,
+)
+
+torch.set_float32_matmul_precision("high")
+tensordict.nn.functional_modules._exclude_td_from_pytree().set()
+
+
+@hydra.main(version_base="1.1", config_path="", config_name="config-async")
+def main(cfg: DictConfig):  # noqa: F821
+    device = (
+        torch.device(cfg.network.device)
+        if cfg.network.device
+        else get_available_device()
+    )
+
+    # Create logger
+    exp_name = generate_exp_name("SAC", cfg.logger.exp_name)
+    logger = None
+    if cfg.logger.backend:
+        logger = get_logger(
+            logger_type=cfg.logger.backend,
+            logger_name="async_sac_logging",
+            experiment_name=exp_name,
+            wandb_kwargs={
+                "mode": cfg.logger.mode,
+                "config": dict(cfg),
+                "project": cfg.logger.project_name,
+                "group": cfg.logger.group_name,
+            },
+        )
+
+    torch.manual_seed(cfg.env.seed)
+    np.random.seed(cfg.env.seed)
+
+    # Create environments
+    _, eval_env = make_environment(cfg, logger=logger)
+
+    # TODO: This should be simplified. We need to create the policy on cuda:1 directly because of the bounds
+    # of the TanhDistribution which cannot be sent to cuda:1 within the distribution construction (ie, the
+    # distribution kwargs need to have access to the low / high values on the right device for compile and
+    # cudagraph to work).
+    # Create agent
+    dummy_train_env = make_train_environment(cfg)
+    model, _ = make_sac_agent(cfg, dummy_train_env, eval_env, device)
+    _, exploration_policy = make_sac_agent(cfg, dummy_train_env, eval_env, "cuda:1")
+    dummy_train_env.close(raise_if_closed=False)
+    del dummy_train_env
+    exploration_policy.load_state_dict(model[0].state_dict())
+
+    # Create SAC loss
+    loss_module, target_net_updater = make_loss_module(cfg, model)
+
+    compile_mode = None
+    if cfg.compile.compile:
+        compile_mode = cfg.compile.compile_mode
+        if compile_mode in ("", None):
+            if cfg.compile.cudagraphs:
+                compile_mode = "default"
+            else:
+                compile_mode = "reduce-overhead"
+    compile_mode_collector = compile_mode  # "reduce-overhead"
+
+    # TODO: enabling prefetch for mp RBs would speed up sampling which is currently responsible for
+    # half of the compute time on the trainer side.
+    # Create replay buffer
+    replay_buffer = make_replay_buffer(
+        batch_size=cfg.optim.batch_size,
+        prb=cfg.replay_buffer.prb,
+        buffer_size=cfg.replay_buffer.size,
+        scratch_dir=cfg.replay_buffer.scratch_dir,
+        device=device,
+        shared=True,
+        prefetch=0,
+    )
+
+    # TODO: Simplify this - ideally we'd like to share the uninitialized lazy tensor storage and fetch it once
+    # it's initialized
+    replay_buffer.extend(make_train_environment(cfg).rollout(1).view(-1))
+    replay_buffer.empty()
+
+    # Create off-policy collector and start it
+    collector = make_collector_async(
+        cfg,
+        partial(make_train_environment, cfg),
+        exploration_policy,
+        compile_mode=compile_mode_collector,
+        replay_buffer=replay_buffer,
+    )
+
+    # Create optimizers
+    (
+        optimizer_actor,
+        optimizer_critic,
+        optimizer_alpha,
+    ) = make_sac_optimizer(cfg, loss_module)
+    optimizer = group_optimizers(optimizer_actor, optimizer_critic, optimizer_alpha)
+    del optimizer_actor, optimizer_critic, optimizer_alpha
+
+    def update(sampled_tensordict):
+        # Compute loss
+        loss_td = loss_module(sampled_tensordict)
+
+        actor_loss = loss_td["loss_actor"]
+        q_loss = loss_td["loss_qvalue"]
+        alpha_loss = loss_td["loss_alpha"]
+
+        (actor_loss + q_loss + alpha_loss).sum().backward()
+        optimizer.step()
+
+        # Update qnet_target params
+        target_net_updater.step()
+
+        optimizer.zero_grad(set_to_none=True)
+        return loss_td.detach()
+
+    if cfg.compile.compile:
+        update = compile_with_warmup(update, mode=compile_mode, warmup=2)
+
+    cfg.compile.cudagraphs
+    if cfg.compile.cudagraphs:
+        warnings.warn(
+            "CudaGraphModule is experimental and may lead to silently wrong results. Use with caution.",
+            category=UserWarning,
+        )
+        update = CudaGraphModule(update, in_keys=[], out_keys=[], warmup=10)
+
+    # Main loop
+    init_random_frames = cfg.collector.init_random_frames
+
+    prb = cfg.replay_buffer.prb
+    update_freq = cfg.collector.update_freq
+
+    eval_rollout_steps = cfg.env.max_episode_steps
+    log_freq = cfg.logger.log_freq
+
+    # TODO: customize this
+    num_updates = 1000
+    total_iter = 1000
+    pbar = tqdm.tqdm(total=total_iter * num_updates)
+    params = TensorDict.from_module(model[0]).data
+
+    # Wait till we have enough data to start training
+    while replay_buffer.write_count <= init_random_frames:
+        time.sleep(0.01)
+
+    losses = []
+    for i in range(total_iter * num_updates):
+        timeit.printevery(
+            num_prints=total_iter * num_updates // log_freq,
+            total_count=total_iter * num_updates,
+            erase=True,
+        )
+
+        if (i % update_freq) == 0:
+            # Update weights of the inference policy
+            torchrl_logger.info("Updating weights")
+            collector.update_policy_weights_(params)
+
+        pbar.update(1)
+
+        # Optimization steps
+        with timeit("train"):
+            with timeit("train - rb - sample"):
+                # Sample from replay buffer
+                sampled_tensordict = replay_buffer.sample()
+
+            with timeit("train - update"):
+                torch.compiler.cudagraph_mark_step_begin()
+                loss_td = update(sampled_tensordict).clone()
+            losses.append(loss_td.select("loss_actor", "loss_qvalue", "loss_alpha"))
+
+            # Update priority
+            if prb:
+                replay_buffer.update_priority(sampled_tensordict)
+
+        # Logging
+        if (i % log_freq) == (log_freq - 1):
+            torchrl_logger.info("Logging")
+            collected_frames = replay_buffer.write_count
+            metrics_to_log = {}
+            if collected_frames >= init_random_frames:
+                losses_m = torch.stack(losses).mean()
+                losses = []
+                metrics_to_log["train/q_loss"] = losses_m.get("loss_qvalue")
+                metrics_to_log["train/actor_loss"] = losses_m.get("loss_actor")
+                metrics_to_log["train/alpha_loss"] = losses_m.get("loss_alpha")
+                metrics_to_log["train/alpha"] = loss_td["alpha"]
+                metrics_to_log["train/entropy"] = loss_td["entropy"]
+            metrics_to_log["train/collected_frames"] = int(collected_frames)
+
+            # Evaluation
+            with set_exploration_type(
+                ExplorationType.DETERMINISTIC
+            ), torch.no_grad(), timeit("eval"):
+                eval_rollout = eval_env.rollout(
+                    eval_rollout_steps,
+                    model[0],
+                    auto_cast_to_device=True,
+                    break_when_any_done=True,
+                )
+                eval_env.apply(dump_video)
+                eval_reward = eval_rollout["next", "reward"].sum(-2).mean().item()
+                metrics_to_log["eval/reward"] = eval_reward
+            torchrl_logger.info(f"Logs: {metrics_to_log}")
+            if logger is not None:
+                metrics_to_log.update(timeit.todict(prefix="time"))
+                metrics_to_log["time/speed"] = pbar.format_dict["rate"]
+                log_metrics(logger, metrics_to_log, collected_frames)
+
+    collector.shutdown()
+    if not eval_env.is_closed:
+        eval_env.close()
+
+
+if __name__ == "__main__":
+    main()
sota-implementations/sac/sac.py

@@ -0,0 +1,239 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""SAC Example.
+
+This is a simple self-contained example of a SAC training script.
+
+It supports state environments like MuJoCo.
+
+The helper functions are coded in the utils.py associated with this script.
+"""
+from __future__ import annotations
+
+import warnings
+
+import hydra
+import numpy as np
+import torch
+import torch.cuda
+import tqdm
+from tensordict import TensorDict
+from tensordict.nn import CudaGraphModule
+from torchrl._utils import compile_with_warmup, get_available_device, timeit
+from torchrl.envs.utils import ExplorationType, set_exploration_type
+from torchrl.objectives import group_optimizers
+from torchrl.record.loggers import generate_exp_name, get_logger
+from utils import (
+    dump_video,
+    log_metrics,
+    make_collector,
+    make_environment,
+    make_loss_module,
+    make_replay_buffer,
+    make_sac_agent,
+    make_sac_optimizer,
+)
+
+torch.set_float32_matmul_precision("high")
+
+
+@hydra.main(version_base="1.1", config_path="", config_name="config")
+def main(cfg: DictConfig):  # noqa: F821
+    device = (
+        torch.device(cfg.network.device)
+        if cfg.network.device
+        else get_available_device()
+    )
+
+    # Create logger
+    exp_name = generate_exp_name("SAC", cfg.logger.exp_name)
+    logger = None
+    if cfg.logger.backend:
+        logger = get_logger(
+            logger_type=cfg.logger.backend,
+            logger_name="sac_logging",
+            experiment_name=exp_name,
+            wandb_kwargs={
+                "mode": cfg.logger.mode,
+                "config": dict(cfg),
+                "project": cfg.logger.project_name,
+                "group": cfg.logger.group_name,
+            },
+        )
+
+    torch.manual_seed(cfg.env.seed)
+    np.random.seed(cfg.env.seed)
+
+    # Create environments
+    train_env, eval_env = make_environment(cfg, logger=logger)
+
+    # Create agent
+    model, exploration_policy = make_sac_agent(cfg, train_env, eval_env, device)
+
+    # Create SAC loss
+    loss_module, target_net_updater = make_loss_module(cfg, model)
+
+    compile_mode = None
+    if cfg.compile.compile:
+        compile_mode = cfg.compile.compile_mode
+        if compile_mode in ("", None):
+            if cfg.compile.cudagraphs:
+                compile_mode = "default"
+            else:
+                compile_mode = "reduce-overhead"
+
+    # Create off-policy collector
+    collector = make_collector(
+        cfg, train_env, exploration_policy, compile_mode=compile_mode
+    )
+
+    # Create replay buffer
+    replay_buffer = make_replay_buffer(
+        batch_size=cfg.optim.batch_size,
+        prb=cfg.replay_buffer.prb,
+        buffer_size=cfg.replay_buffer.size,
+        scratch_dir=cfg.replay_buffer.scratch_dir,
+        device=device,
+    )
+
+    # Create optimizers
+    (
+        optimizer_actor,
+        optimizer_critic,
+        optimizer_alpha,
+    ) = make_sac_optimizer(cfg, loss_module)
+    optimizer = group_optimizers(optimizer_actor, optimizer_critic, optimizer_alpha)
+    del optimizer_actor, optimizer_critic, optimizer_alpha
+
+    def update(sampled_tensordict):
+        # Compute loss
+        loss_td = loss_module(sampled_tensordict)
+
+        actor_loss = loss_td["loss_actor"]
+        q_loss = loss_td["loss_qvalue"]
+        alpha_loss = loss_td["loss_alpha"]
+
+        (actor_loss + q_loss + alpha_loss).sum().backward()
+        optimizer.step()
+        optimizer.zero_grad(set_to_none=True)
+
+        # Update qnet_target params
+        target_net_updater.step()
+        return loss_td.detach()
+
+    if cfg.compile.compile:
+        update = compile_with_warmup(update, mode=compile_mode, warmup=1)
+
+    if cfg.compile.cudagraphs:
+        warnings.warn(
+            "CudaGraphModule is experimental and may lead to silently wrong results. Use with caution.",
+            category=UserWarning,
+        )
+        update = CudaGraphModule(update, in_keys=[], out_keys=[], warmup=5)
+
+    # Main loop
+    collected_frames = 0
+    pbar = tqdm.tqdm(total=cfg.collector.total_frames)
+
+    init_random_frames = cfg.collector.init_random_frames
+    num_updates = int(cfg.collector.frames_per_batch * cfg.optim.utd_ratio)
+    prb = cfg.replay_buffer.prb
+    eval_iter = cfg.logger.eval_iter
+    frames_per_batch = cfg.collector.frames_per_batch
+    eval_rollout_steps = cfg.env.max_episode_steps
+
+    collector_iter = iter(collector)
+    total_iter = len(collector)
+
+    for i in range(total_iter):
+        timeit.printevery(num_prints=1000, total_count=total_iter, erase=True)
+
+        with timeit("collect"):
+            tensordict = next(collector_iter)
+
+        # Update weights of the inference policy
+        collector.update_policy_weights_()
+
+        current_frames = tensordict.numel()
+        pbar.update(current_frames)
+
+        with timeit("rb - extend"):
+            # Add to replay buffer
+            tensordict = tensordict.reshape(-1)
+            replay_buffer.extend(tensordict)
+
+        collected_frames += current_frames
+
+        # Optimization steps
+        with timeit("train"):
+            if collected_frames >= init_random_frames:
+                losses = TensorDict(batch_size=[num_updates])
+                for i in range(num_updates):
+                    with timeit("rb - sample"):
+                        # Sample from replay buffer
+                        sampled_tensordict = replay_buffer.sample()
+
+                    with timeit("update"):
+                        torch.compiler.cudagraph_mark_step_begin()
+                        loss_td = update(sampled_tensordict).clone()
+                    losses[i] = loss_td.select(
+                        "loss_actor", "loss_qvalue", "loss_alpha"
+                    )
+
+                    # Update priority
+                    if prb:
+                        replay_buffer.update_priority(sampled_tensordict)
+
+        episode_end = (
+            tensordict["next", "done"]
+            if tensordict["next", "done"].any()
+            else tensordict["next", "truncated"]
+        )
+        episode_rewards = tensordict["next", "episode_reward"][episode_end]
+
+        # Logging
+        metrics_to_log = {}
+        if len(episode_rewards) > 0:
+            episode_length = tensordict["next", "step_count"][episode_end]
+            metrics_to_log["train/reward"] = episode_rewards
+            metrics_to_log["train/episode_length"] = episode_length.sum() / len(
+                episode_length
+            )
+        if collected_frames >= init_random_frames:
+            losses = losses.mean()
+            metrics_to_log["train/q_loss"] = losses.get("loss_qvalue")
+            metrics_to_log["train/actor_loss"] = losses.get("loss_actor")
+            metrics_to_log["train/alpha_loss"] = losses.get("loss_alpha")
+            metrics_to_log["train/alpha"] = loss_td["alpha"]
+            metrics_to_log["train/entropy"] = loss_td["entropy"]
+
+        # Evaluation
+        if abs(collected_frames % eval_iter) < frames_per_batch:
+            with set_exploration_type(
+                ExplorationType.DETERMINISTIC
+            ), torch.no_grad(), timeit("eval"):
+                eval_rollout = eval_env.rollout(
+                    eval_rollout_steps,
+                    model[0],
+                    auto_cast_to_device=True,
+                    break_when_any_done=True,
+                )
+                eval_env.apply(dump_video)
+                eval_reward = eval_rollout["next", "reward"].sum(-2).mean().item()
+                metrics_to_log["eval/reward"] = eval_reward
+        if logger is not None:
+            metrics_to_log.update(timeit.todict(prefix="time"))
+            metrics_to_log["time/speed"] = pbar.format_dict["rate"]
+            log_metrics(logger, metrics_to_log, collected_frames)
+
+    collector.shutdown()
+    if not eval_env.is_closed:
+        eval_env.close()
+    if not train_env.is_closed:
+        train_env.close()
+
+
+if __name__ == "__main__":
+    main()