torchrl-0.11.0-cp314-cp314t-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (394)
  1. benchmarks/benchmark_batched_envs.py +104 -0
  2. benchmarks/conftest.py +91 -0
  3. benchmarks/ecosystem/gym_env_throughput.py +321 -0
  4. benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
  5. benchmarks/requirements.txt +7 -0
  6. benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
  7. benchmarks/test_collectors_benchmark.py +240 -0
  8. benchmarks/test_compressed_storage_benchmark.py +145 -0
  9. benchmarks/test_envs_benchmark.py +133 -0
  10. benchmarks/test_llm.py +101 -0
  11. benchmarks/test_non_tensor_env_benchmark.py +70 -0
  12. benchmarks/test_objectives_benchmarks.py +1199 -0
  13. benchmarks/test_replaybuffer_benchmark.py +254 -0
  14. sota-check/README.md +35 -0
  15. sota-implementations/README.md +142 -0
  16. sota-implementations/a2c/README.md +39 -0
  17. sota-implementations/a2c/a2c_atari.py +291 -0
  18. sota-implementations/a2c/a2c_mujoco.py +273 -0
  19. sota-implementations/a2c/utils_atari.py +240 -0
  20. sota-implementations/a2c/utils_mujoco.py +160 -0
  21. sota-implementations/bandits/README.md +7 -0
  22. sota-implementations/bandits/dqn.py +126 -0
  23. sota-implementations/cql/cql_offline.py +198 -0
  24. sota-implementations/cql/cql_online.py +249 -0
  25. sota-implementations/cql/discrete_cql_offline.py +180 -0
  26. sota-implementations/cql/discrete_cql_online.py +227 -0
  27. sota-implementations/cql/utils.py +471 -0
  28. sota-implementations/crossq/crossq.py +271 -0
  29. sota-implementations/crossq/utils.py +320 -0
  30. sota-implementations/ddpg/ddpg.py +231 -0
  31. sota-implementations/ddpg/utils.py +325 -0
  32. sota-implementations/decision_transformer/dt.py +163 -0
  33. sota-implementations/decision_transformer/lamb.py +167 -0
  34. sota-implementations/decision_transformer/online_dt.py +178 -0
  35. sota-implementations/decision_transformer/utils.py +562 -0
  36. sota-implementations/discrete_sac/discrete_sac.py +243 -0
  37. sota-implementations/discrete_sac/utils.py +324 -0
  38. sota-implementations/dqn/README.md +30 -0
  39. sota-implementations/dqn/dqn_atari.py +272 -0
  40. sota-implementations/dqn/dqn_cartpole.py +236 -0
  41. sota-implementations/dqn/utils_atari.py +132 -0
  42. sota-implementations/dqn/utils_cartpole.py +90 -0
  43. sota-implementations/dreamer/README.md +129 -0
  44. sota-implementations/dreamer/dreamer.py +586 -0
  45. sota-implementations/dreamer/dreamer_utils.py +1107 -0
  46. sota-implementations/expert-iteration/README.md +352 -0
  47. sota-implementations/expert-iteration/ei_utils.py +770 -0
  48. sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
  49. sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
  50. sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
  51. sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
  52. sota-implementations/gail/gail.py +327 -0
  53. sota-implementations/gail/gail_utils.py +68 -0
  54. sota-implementations/gail/ppo_utils.py +157 -0
  55. sota-implementations/grpo/README.md +273 -0
  56. sota-implementations/grpo/grpo-async.py +437 -0
  57. sota-implementations/grpo/grpo-sync.py +435 -0
  58. sota-implementations/grpo/grpo_utils.py +843 -0
  59. sota-implementations/grpo/requirements_gsm8k.txt +11 -0
  60. sota-implementations/grpo/requirements_ifeval.txt +16 -0
  61. sota-implementations/impala/README.md +33 -0
  62. sota-implementations/impala/impala_multi_node_ray.py +292 -0
  63. sota-implementations/impala/impala_multi_node_submitit.py +284 -0
  64. sota-implementations/impala/impala_single_node.py +261 -0
  65. sota-implementations/impala/utils.py +184 -0
  66. sota-implementations/iql/discrete_iql.py +230 -0
  67. sota-implementations/iql/iql_offline.py +164 -0
  68. sota-implementations/iql/iql_online.py +225 -0
  69. sota-implementations/iql/utils.py +437 -0
  70. sota-implementations/multiagent/README.md +74 -0
  71. sota-implementations/multiagent/iql.py +237 -0
  72. sota-implementations/multiagent/maddpg_iddpg.py +266 -0
  73. sota-implementations/multiagent/mappo_ippo.py +267 -0
  74. sota-implementations/multiagent/qmix_vdn.py +271 -0
  75. sota-implementations/multiagent/sac.py +337 -0
  76. sota-implementations/multiagent/utils/__init__.py +4 -0
  77. sota-implementations/multiagent/utils/logging.py +151 -0
  78. sota-implementations/multiagent/utils/utils.py +43 -0
  79. sota-implementations/ppo/README.md +29 -0
  80. sota-implementations/ppo/ppo_atari.py +305 -0
  81. sota-implementations/ppo/ppo_mujoco.py +293 -0
  82. sota-implementations/ppo/utils_atari.py +238 -0
  83. sota-implementations/ppo/utils_mujoco.py +152 -0
  84. sota-implementations/ppo_trainer/train.py +21 -0
  85. sota-implementations/redq/README.md +7 -0
  86. sota-implementations/redq/redq.py +199 -0
  87. sota-implementations/redq/utils.py +1060 -0
  88. sota-implementations/sac/sac-async.py +266 -0
  89. sota-implementations/sac/sac.py +239 -0
  90. sota-implementations/sac/utils.py +381 -0
  91. sota-implementations/sac_trainer/train.py +16 -0
  92. sota-implementations/td3/td3.py +254 -0
  93. sota-implementations/td3/utils.py +319 -0
  94. sota-implementations/td3_bc/td3_bc.py +177 -0
  95. sota-implementations/td3_bc/utils.py +251 -0
  96. torchrl/__init__.py +144 -0
  97. torchrl/_extension.py +74 -0
  98. torchrl/_torchrl.cp314t-win_amd64.pyd +0 -0
  99. torchrl/_utils.py +1431 -0
  100. torchrl/collectors/__init__.py +48 -0
  101. torchrl/collectors/_base.py +1058 -0
  102. torchrl/collectors/_constants.py +88 -0
  103. torchrl/collectors/_multi_async.py +324 -0
  104. torchrl/collectors/_multi_base.py +1805 -0
  105. torchrl/collectors/_multi_sync.py +464 -0
  106. torchrl/collectors/_runner.py +581 -0
  107. torchrl/collectors/_single.py +2009 -0
  108. torchrl/collectors/_single_async.py +259 -0
  109. torchrl/collectors/collectors.py +62 -0
  110. torchrl/collectors/distributed/__init__.py +32 -0
  111. torchrl/collectors/distributed/default_configs.py +133 -0
  112. torchrl/collectors/distributed/generic.py +1306 -0
  113. torchrl/collectors/distributed/ray.py +1092 -0
  114. torchrl/collectors/distributed/rpc.py +1006 -0
  115. torchrl/collectors/distributed/sync.py +731 -0
  116. torchrl/collectors/distributed/utils.py +160 -0
  117. torchrl/collectors/llm/__init__.py +10 -0
  118. torchrl/collectors/llm/base.py +494 -0
  119. torchrl/collectors/llm/ray_collector.py +275 -0
  120. torchrl/collectors/llm/utils.py +36 -0
  121. torchrl/collectors/llm/weight_update/__init__.py +10 -0
  122. torchrl/collectors/llm/weight_update/vllm.py +348 -0
  123. torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
  124. torchrl/collectors/utils.py +433 -0
  125. torchrl/collectors/weight_update.py +591 -0
  126. torchrl/csrc/numpy_utils.h +38 -0
  127. torchrl/csrc/pybind.cpp +27 -0
  128. torchrl/csrc/segment_tree.h +458 -0
  129. torchrl/csrc/torch_utils.h +34 -0
  130. torchrl/csrc/utils.cpp +48 -0
  131. torchrl/csrc/utils.h +31 -0
  132. torchrl/data/__init__.py +187 -0
  133. torchrl/data/datasets/__init__.py +58 -0
  134. torchrl/data/datasets/atari_dqn.py +878 -0
  135. torchrl/data/datasets/common.py +281 -0
  136. torchrl/data/datasets/d4rl.py +489 -0
  137. torchrl/data/datasets/d4rl_infos.py +187 -0
  138. torchrl/data/datasets/gen_dgrl.py +375 -0
  139. torchrl/data/datasets/minari_data.py +643 -0
  140. torchrl/data/datasets/openml.py +177 -0
  141. torchrl/data/datasets/openx.py +798 -0
  142. torchrl/data/datasets/roboset.py +363 -0
  143. torchrl/data/datasets/utils.py +11 -0
  144. torchrl/data/datasets/vd4rl.py +432 -0
  145. torchrl/data/llm/__init__.py +34 -0
  146. torchrl/data/llm/dataset.py +491 -0
  147. torchrl/data/llm/history.py +1378 -0
  148. torchrl/data/llm/prompt.py +198 -0
  149. torchrl/data/llm/reward.py +225 -0
  150. torchrl/data/llm/topk.py +186 -0
  151. torchrl/data/llm/utils.py +543 -0
  152. torchrl/data/map/__init__.py +21 -0
  153. torchrl/data/map/hash.py +185 -0
  154. torchrl/data/map/query.py +204 -0
  155. torchrl/data/map/tdstorage.py +363 -0
  156. torchrl/data/map/tree.py +1434 -0
  157. torchrl/data/map/utils.py +103 -0
  158. torchrl/data/postprocs/__init__.py +8 -0
  159. torchrl/data/postprocs/postprocs.py +391 -0
  160. torchrl/data/replay_buffers/__init__.py +99 -0
  161. torchrl/data/replay_buffers/checkpointers.py +622 -0
  162. torchrl/data/replay_buffers/ray_buffer.py +292 -0
  163. torchrl/data/replay_buffers/replay_buffers.py +2376 -0
  164. torchrl/data/replay_buffers/samplers.py +2578 -0
  165. torchrl/data/replay_buffers/scheduler.py +265 -0
  166. torchrl/data/replay_buffers/storages.py +2412 -0
  167. torchrl/data/replay_buffers/utils.py +1042 -0
  168. torchrl/data/replay_buffers/writers.py +781 -0
  169. torchrl/data/tensor_specs.py +7101 -0
  170. torchrl/data/utils.py +334 -0
  171. torchrl/envs/__init__.py +265 -0
  172. torchrl/envs/async_envs.py +1105 -0
  173. torchrl/envs/batched_envs.py +3093 -0
  174. torchrl/envs/common.py +4241 -0
  175. torchrl/envs/custom/__init__.py +11 -0
  176. torchrl/envs/custom/chess.py +617 -0
  177. torchrl/envs/custom/llm.py +214 -0
  178. torchrl/envs/custom/pendulum.py +401 -0
  179. torchrl/envs/custom/san_moves.txt +29274 -0
  180. torchrl/envs/custom/tictactoeenv.py +288 -0
  181. torchrl/envs/env_creator.py +263 -0
  182. torchrl/envs/gym_like.py +752 -0
  183. torchrl/envs/libs/__init__.py +68 -0
  184. torchrl/envs/libs/_gym_utils.py +326 -0
  185. torchrl/envs/libs/brax.py +846 -0
  186. torchrl/envs/libs/dm_control.py +544 -0
  187. torchrl/envs/libs/envpool.py +447 -0
  188. torchrl/envs/libs/gym.py +2239 -0
  189. torchrl/envs/libs/habitat.py +138 -0
  190. torchrl/envs/libs/isaac_lab.py +87 -0
  191. torchrl/envs/libs/isaacgym.py +203 -0
  192. torchrl/envs/libs/jax_utils.py +166 -0
  193. torchrl/envs/libs/jumanji.py +963 -0
  194. torchrl/envs/libs/meltingpot.py +599 -0
  195. torchrl/envs/libs/openml.py +153 -0
  196. torchrl/envs/libs/openspiel.py +652 -0
  197. torchrl/envs/libs/pettingzoo.py +1042 -0
  198. torchrl/envs/libs/procgen.py +351 -0
  199. torchrl/envs/libs/robohive.py +429 -0
  200. torchrl/envs/libs/smacv2.py +645 -0
  201. torchrl/envs/libs/unity_mlagents.py +891 -0
  202. torchrl/envs/libs/utils.py +147 -0
  203. torchrl/envs/libs/vmas.py +813 -0
  204. torchrl/envs/llm/__init__.py +63 -0
  205. torchrl/envs/llm/chat.py +730 -0
  206. torchrl/envs/llm/datasets/README.md +4 -0
  207. torchrl/envs/llm/datasets/__init__.py +17 -0
  208. torchrl/envs/llm/datasets/gsm8k.py +353 -0
  209. torchrl/envs/llm/datasets/ifeval.py +274 -0
  210. torchrl/envs/llm/envs.py +789 -0
  211. torchrl/envs/llm/libs/README.md +3 -0
  212. torchrl/envs/llm/libs/__init__.py +8 -0
  213. torchrl/envs/llm/libs/mlgym.py +869 -0
  214. torchrl/envs/llm/reward/__init__.py +10 -0
  215. torchrl/envs/llm/reward/gsm8k.py +324 -0
  216. torchrl/envs/llm/reward/ifeval/README.md +13 -0
  217. torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
  218. torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
  219. torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
  220. torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
  221. torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
  222. torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
  223. torchrl/envs/llm/transforms/__init__.py +55 -0
  224. torchrl/envs/llm/transforms/browser.py +292 -0
  225. torchrl/envs/llm/transforms/dataloading.py +859 -0
  226. torchrl/envs/llm/transforms/format.py +73 -0
  227. torchrl/envs/llm/transforms/kl.py +1544 -0
  228. torchrl/envs/llm/transforms/policy_version.py +189 -0
  229. torchrl/envs/llm/transforms/reason.py +323 -0
  230. torchrl/envs/llm/transforms/tokenizer.py +321 -0
  231. torchrl/envs/llm/transforms/tools.py +1955 -0
  232. torchrl/envs/model_based/__init__.py +9 -0
  233. torchrl/envs/model_based/common.py +180 -0
  234. torchrl/envs/model_based/dreamer.py +112 -0
  235. torchrl/envs/transforms/__init__.py +147 -0
  236. torchrl/envs/transforms/functional.py +48 -0
  237. torchrl/envs/transforms/gym_transforms.py +203 -0
  238. torchrl/envs/transforms/module.py +341 -0
  239. torchrl/envs/transforms/r3m.py +372 -0
  240. torchrl/envs/transforms/ray_service.py +663 -0
  241. torchrl/envs/transforms/rb_transforms.py +214 -0
  242. torchrl/envs/transforms/transforms.py +11835 -0
  243. torchrl/envs/transforms/utils.py +94 -0
  244. torchrl/envs/transforms/vc1.py +307 -0
  245. torchrl/envs/transforms/vecnorm.py +845 -0
  246. torchrl/envs/transforms/vip.py +407 -0
  247. torchrl/envs/utils.py +1718 -0
  248. torchrl/envs/vec_envs.py +11 -0
  249. torchrl/modules/__init__.py +206 -0
  250. torchrl/modules/distributions/__init__.py +73 -0
  251. torchrl/modules/distributions/continuous.py +830 -0
  252. torchrl/modules/distributions/discrete.py +908 -0
  253. torchrl/modules/distributions/truncated_normal.py +187 -0
  254. torchrl/modules/distributions/utils.py +233 -0
  255. torchrl/modules/llm/__init__.py +62 -0
  256. torchrl/modules/llm/backends/__init__.py +65 -0
  257. torchrl/modules/llm/backends/vllm/__init__.py +94 -0
  258. torchrl/modules/llm/backends/vllm/_models.py +46 -0
  259. torchrl/modules/llm/backends/vllm/base.py +72 -0
  260. torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
  261. torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
  262. torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
  263. torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
  264. torchrl/modules/llm/policies/__init__.py +28 -0
  265. torchrl/modules/llm/policies/common.py +1809 -0
  266. torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
  267. torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
  268. torchrl/modules/llm/utils.py +23 -0
  269. torchrl/modules/mcts/__init__.py +21 -0
  270. torchrl/modules/mcts/scores.py +579 -0
  271. torchrl/modules/models/__init__.py +86 -0
  272. torchrl/modules/models/batchrenorm.py +119 -0
  273. torchrl/modules/models/decision_transformer.py +179 -0
  274. torchrl/modules/models/exploration.py +731 -0
  275. torchrl/modules/models/llm.py +156 -0
  276. torchrl/modules/models/model_based.py +596 -0
  277. torchrl/modules/models/models.py +1712 -0
  278. torchrl/modules/models/multiagent.py +1067 -0
  279. torchrl/modules/models/recipes/impala.py +185 -0
  280. torchrl/modules/models/utils.py +162 -0
  281. torchrl/modules/planners/__init__.py +10 -0
  282. torchrl/modules/planners/cem.py +228 -0
  283. torchrl/modules/planners/common.py +73 -0
  284. torchrl/modules/planners/mppi.py +265 -0
  285. torchrl/modules/tensordict_module/__init__.py +89 -0
  286. torchrl/modules/tensordict_module/actors.py +2457 -0
  287. torchrl/modules/tensordict_module/common.py +529 -0
  288. torchrl/modules/tensordict_module/exploration.py +814 -0
  289. torchrl/modules/tensordict_module/probabilistic.py +321 -0
  290. torchrl/modules/tensordict_module/rnn.py +1639 -0
  291. torchrl/modules/tensordict_module/sequence.py +132 -0
  292. torchrl/modules/tensordict_module/world_models.py +34 -0
  293. torchrl/modules/utils/__init__.py +38 -0
  294. torchrl/modules/utils/mappings.py +9 -0
  295. torchrl/modules/utils/utils.py +89 -0
  296. torchrl/objectives/__init__.py +78 -0
  297. torchrl/objectives/a2c.py +659 -0
  298. torchrl/objectives/common.py +753 -0
  299. torchrl/objectives/cql.py +1346 -0
  300. torchrl/objectives/crossq.py +710 -0
  301. torchrl/objectives/ddpg.py +453 -0
  302. torchrl/objectives/decision_transformer.py +371 -0
  303. torchrl/objectives/deprecated.py +516 -0
  304. torchrl/objectives/dqn.py +683 -0
  305. torchrl/objectives/dreamer.py +488 -0
  306. torchrl/objectives/functional.py +48 -0
  307. torchrl/objectives/gail.py +258 -0
  308. torchrl/objectives/iql.py +996 -0
  309. torchrl/objectives/llm/__init__.py +30 -0
  310. torchrl/objectives/llm/grpo.py +846 -0
  311. torchrl/objectives/llm/sft.py +482 -0
  312. torchrl/objectives/multiagent/__init__.py +8 -0
  313. torchrl/objectives/multiagent/qmixer.py +396 -0
  314. torchrl/objectives/ppo.py +1669 -0
  315. torchrl/objectives/redq.py +683 -0
  316. torchrl/objectives/reinforce.py +530 -0
  317. torchrl/objectives/sac.py +1580 -0
  318. torchrl/objectives/td3.py +570 -0
  319. torchrl/objectives/td3_bc.py +625 -0
  320. torchrl/objectives/utils.py +782 -0
  321. torchrl/objectives/value/__init__.py +28 -0
  322. torchrl/objectives/value/advantages.py +1956 -0
  323. torchrl/objectives/value/functional.py +1459 -0
  324. torchrl/objectives/value/utils.py +360 -0
  325. torchrl/record/__init__.py +17 -0
  326. torchrl/record/loggers/__init__.py +23 -0
  327. torchrl/record/loggers/common.py +48 -0
  328. torchrl/record/loggers/csv.py +226 -0
  329. torchrl/record/loggers/mlflow.py +142 -0
  330. torchrl/record/loggers/tensorboard.py +139 -0
  331. torchrl/record/loggers/trackio.py +163 -0
  332. torchrl/record/loggers/utils.py +78 -0
  333. torchrl/record/loggers/wandb.py +214 -0
  334. torchrl/record/recorder.py +554 -0
  335. torchrl/services/__init__.py +79 -0
  336. torchrl/services/base.py +109 -0
  337. torchrl/services/ray_service.py +453 -0
  338. torchrl/testing/__init__.py +107 -0
  339. torchrl/testing/assertions.py +179 -0
  340. torchrl/testing/dist_utils.py +122 -0
  341. torchrl/testing/env_creators.py +227 -0
  342. torchrl/testing/env_helper.py +35 -0
  343. torchrl/testing/gym_helpers.py +156 -0
  344. torchrl/testing/llm_mocks.py +119 -0
  345. torchrl/testing/mocking_classes.py +2720 -0
  346. torchrl/testing/modules.py +295 -0
  347. torchrl/testing/mp_helpers.py +15 -0
  348. torchrl/testing/ray_helpers.py +293 -0
  349. torchrl/testing/utils.py +190 -0
  350. torchrl/trainers/__init__.py +42 -0
  351. torchrl/trainers/algorithms/__init__.py +11 -0
  352. torchrl/trainers/algorithms/configs/__init__.py +705 -0
  353. torchrl/trainers/algorithms/configs/collectors.py +216 -0
  354. torchrl/trainers/algorithms/configs/common.py +41 -0
  355. torchrl/trainers/algorithms/configs/data.py +308 -0
  356. torchrl/trainers/algorithms/configs/envs.py +104 -0
  357. torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
  358. torchrl/trainers/algorithms/configs/logging.py +80 -0
  359. torchrl/trainers/algorithms/configs/modules.py +570 -0
  360. torchrl/trainers/algorithms/configs/objectives.py +177 -0
  361. torchrl/trainers/algorithms/configs/trainers.py +340 -0
  362. torchrl/trainers/algorithms/configs/transforms.py +955 -0
  363. torchrl/trainers/algorithms/configs/utils.py +252 -0
  364. torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
  365. torchrl/trainers/algorithms/configs/weight_update.py +159 -0
  366. torchrl/trainers/algorithms/ppo.py +373 -0
  367. torchrl/trainers/algorithms/sac.py +308 -0
  368. torchrl/trainers/helpers/__init__.py +40 -0
  369. torchrl/trainers/helpers/collectors.py +416 -0
  370. torchrl/trainers/helpers/envs.py +573 -0
  371. torchrl/trainers/helpers/logger.py +33 -0
  372. torchrl/trainers/helpers/losses.py +132 -0
  373. torchrl/trainers/helpers/models.py +658 -0
  374. torchrl/trainers/helpers/replay_buffer.py +59 -0
  375. torchrl/trainers/helpers/trainers.py +301 -0
  376. torchrl/trainers/trainers.py +2052 -0
  377. torchrl/weight_update/__init__.py +33 -0
  378. torchrl/weight_update/_distributed.py +749 -0
  379. torchrl/weight_update/_mp.py +624 -0
  380. torchrl/weight_update/_noupdate.py +102 -0
  381. torchrl/weight_update/_ray.py +1032 -0
  382. torchrl/weight_update/_rpc.py +284 -0
  383. torchrl/weight_update/_shared.py +891 -0
  384. torchrl/weight_update/llm/__init__.py +32 -0
  385. torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
  386. torchrl/weight_update/llm/vllm_nccl.py +710 -0
  387. torchrl/weight_update/utils.py +73 -0
  388. torchrl/weight_update/weight_sync_schemes.py +1244 -0
  389. torchrl-0.11.0.dist-info/LICENSE +21 -0
  390. torchrl-0.11.0.dist-info/METADATA +1307 -0
  391. torchrl-0.11.0.dist-info/RECORD +394 -0
  392. torchrl-0.11.0.dist-info/WHEEL +5 -0
  393. torchrl-0.11.0.dist-info/entry_points.txt +2 -0
  394. torchrl-0.11.0.dist-info/top_level.txt +7 -0
sota-implementations/multiagent/sac.py
@@ -0,0 +1,337 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ from __future__ import annotations
+
+ import time
+
+ import hydra
+ import torch
+ from tensordict.nn import TensorDictModule
+ from tensordict.nn.distributions import NormalParamExtractor
+ from torch import nn
+ from torch.distributions import Categorical, OneHotCategorical
+ from torchrl._utils import logger as torchrl_logger
+ from torchrl.collectors import SyncDataCollector
+ from torchrl.data import TensorDictReplayBuffer
+ from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
+ from torchrl.data.replay_buffers.storages import LazyTensorStorage
+ from torchrl.envs import RewardSum, TransformedEnv
+ from torchrl.envs.libs.vmas import VmasEnv
+ from torchrl.envs.utils import ExplorationType, set_exploration_type
+ from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
+ from torchrl.modules.models.multiagent import MultiAgentMLP
+ from torchrl.objectives import DiscreteSACLoss, SACLoss, SoftUpdate, ValueEstimators
+ from utils.logging import init_logging, log_evaluation, log_training
+ from utils.utils import DoneTransform
+
+
+ def rendering_callback(env, td):
+     env.frames.append(env.render(mode="rgb_array", agent_index_focus=None))
+
+
+ @hydra.main(version_base="1.1", config_path="", config_name="sac")
+ def train(cfg: DictConfig):  # noqa: F821
+     # Device
+     cfg.train.device = "cpu" if not torch.cuda.device_count() else "cuda:0"
+     cfg.env.device = cfg.train.device
+
+     # Seeding
+     torch.manual_seed(cfg.seed)
+
+     # Sampling
+     cfg.env.vmas_envs = cfg.collector.frames_per_batch // cfg.env.max_steps
+     cfg.collector.total_frames = cfg.collector.frames_per_batch * cfg.collector.n_iters
+     cfg.buffer.memory_size = cfg.collector.frames_per_batch
+
+     # Create env and env_test
+     env = VmasEnv(
+         scenario=cfg.env.scenario_name,
+         num_envs=cfg.env.vmas_envs,
+         continuous_actions=cfg.env.continuous_actions,
+         max_steps=cfg.env.max_steps,
+         device=cfg.env.device,
+         seed=cfg.seed,
+         categorical_actions=cfg.env.categorical_actions,
+         # Scenario kwargs
+         **cfg.env.scenario,
+     )
+     env = TransformedEnv(
+         env,
+         RewardSum(in_keys=[env.reward_key], out_keys=[("agents", "episode_reward")]),
+     )
+
+     env_test = VmasEnv(
+         scenario=cfg.env.scenario_name,
+         num_envs=cfg.eval.evaluation_episodes,
+         continuous_actions=cfg.env.continuous_actions,
+         max_steps=cfg.env.max_steps,
+         device=cfg.env.device,
+         seed=cfg.seed,
+         # Scenario kwargs
+         **cfg.env.scenario,
+     )
+
+     # Policy
+     if cfg.env.continuous_actions:
+         actor_net = nn.Sequential(
+             MultiAgentMLP(
+                 n_agent_inputs=env.full_observation_spec_unbatched[
+                     "agents", "observation"
+                 ].shape[-1],
+                 n_agent_outputs=2
+                 * env.full_action_spec_unbatched["agents", "action"].shape[-1],
+                 n_agents=env.n_agents,
+                 centralised=False,
+                 share_params=cfg.model.shared_parameters,
+                 device=cfg.train.device,
+                 depth=2,
+                 num_cells=256,
+                 activation_class=nn.Tanh,
+             ),
+             NormalParamExtractor(),
+         )
+         policy_module = TensorDictModule(
+             actor_net,
+             in_keys=[("agents", "observation")],
+             out_keys=[("agents", "loc"), ("agents", "scale")],
+         )
+
+         policy = ProbabilisticActor(
+             module=policy_module,
+             spec=env.full_action_spec_unbatched,
+             in_keys=[("agents", "loc"), ("agents", "scale")],
+             out_keys=[env.action_key],
+             distribution_class=TanhNormal,
+             distribution_kwargs={
+                 "low": env.full_action_spec_unbatched[("agents", "action")].space.low,
+                 "high": env.full_action_spec_unbatched[("agents", "action")].space.high,
+             },
+             return_log_prob=True,
+         )
+
+         # Critic
+         module = MultiAgentMLP(
+             n_agent_inputs=env.observation_spec["agents", "observation"].shape[-1]
+             + env.full_action_spec_unbatched["agents", "action"].shape[
+                 -1
+             ],  # Q critic takes the observation and the action as input
+             n_agent_outputs=1,
+             n_agents=env.n_agents,
+             centralised=cfg.model.centralised_critic,
+             share_params=cfg.model.shared_parameters,
+             device=cfg.train.device,
+             depth=2,
+             num_cells=256,
+             activation_class=nn.Tanh,
+         )
+         value_module = ValueOperator(
+             module=module,
+             in_keys=[("agents", "observation"), env.action_key],
+             out_keys=[("agents", "state_action_value")],
+         )
+     else:
+         actor_net = nn.Sequential(
+             MultiAgentMLP(
+                 n_agent_inputs=env.observation_spec["agents", "observation"].shape[-1],
+                 n_agent_outputs=env.full_action_spec_unbatched[
+                     "agents", "action"
+                 ].space.n,
+                 n_agents=env.n_agents,
+                 centralised=False,
+                 share_params=cfg.model.shared_parameters,
+                 device=cfg.train.device,
+                 depth=2,
+                 num_cells=256,
+                 activation_class=nn.Tanh,
+             ),
+         )
+         policy_module = TensorDictModule(
+             actor_net,
+             in_keys=[("agents", "observation")],
+             out_keys=[("agents", "logits")],
+         )
+         policy = ProbabilisticActor(
+             module=policy_module,
+             spec=env.full_action_spec_unbatched["agents", "action"],
+             in_keys=[("agents", "logits")],
+             out_keys=[env.action_key],
+             distribution_class=OneHotCategorical
+             if not cfg.env.categorical_actions
+             else Categorical,
+             return_log_prob=True,
+         )
+
+         # Critic
+         module = MultiAgentMLP(
+             n_agent_inputs=env.observation_spec["agents", "observation"].shape[-1],
+             n_agent_outputs=env.full_action_spec_unbatched["agents", "action"].space.n,
+             n_agents=env.n_agents,
+             centralised=cfg.model.centralised_critic,
+             share_params=cfg.model.shared_parameters,
+             device=cfg.train.device,
+             depth=2,
+             num_cells=256,
+             activation_class=nn.Tanh,
+         )
+         value_module = ValueOperator(
+             module=module,
+             in_keys=[("agents", "observation")],
+             out_keys=[("agents", "action_value")],
+         )
+
+     collector = SyncDataCollector(
+         env,
+         policy,
+         device=cfg.env.device,
+         storing_device=cfg.train.device,
+         frames_per_batch=cfg.collector.frames_per_batch,
+         total_frames=cfg.collector.total_frames,
+         postproc=DoneTransform(reward_key=env.reward_key, done_keys=env.done_keys),
+     )
+
+     replay_buffer = TensorDictReplayBuffer(
+         storage=LazyTensorStorage(cfg.buffer.memory_size, device=cfg.train.device),
+         sampler=SamplerWithoutReplacement(),
+         batch_size=cfg.train.minibatch_size,
+     )
+
+     if cfg.env.continuous_actions:
+         loss_module = SACLoss(
+             actor_network=policy,
+             qvalue_network=value_module,
+             delay_qvalue=True,
+             action_spec=env.full_action_spec_unbatched,
+         )
+         loss_module.set_keys(
+             state_action_value=("agents", "state_action_value"),
+             action=env.action_key,
+             reward=env.reward_key,
+             done=("agents", "done"),
+             terminated=("agents", "terminated"),
+         )
+     else:
+         loss_module = DiscreteSACLoss(
+             actor_network=policy,
+             qvalue_network=value_module,
+             delay_qvalue=True,
+             num_actions=env.full_action_spec_unbatched["agents", "action"].space.n,
+             action_space=env.full_action_spec_unbatched,
+         )
+         loss_module.set_keys(
+             action_value=("agents", "action_value"),
+             action=env.action_key,
+             reward=env.reward_key,
+             done=("agents", "done"),
+             terminated=("agents", "terminated"),
+         )
+
+     loss_module.make_value_estimator(ValueEstimators.TD0, gamma=cfg.loss.gamma)
+     target_net_updater = SoftUpdate(loss_module, eps=1 - cfg.loss.tau)
+
+     optim = torch.optim.Adam(loss_module.parameters(), cfg.train.lr)
+
+     # Logging
+     if cfg.logger.backend:
+         model_name = (
+             ("Het" if not cfg.model.shared_parameters else "")
+             + ("MA" if cfg.model.centralised_critic else "I")
+             + "SAC"
+         )
+         logger = init_logging(cfg, model_name)
+
+     total_time = 0
+     total_frames = 0
+     sampling_start = time.time()
+     for i, tensordict_data in enumerate(collector):
+         torchrl_logger.info(f"\nIteration {i}")
+
+         sampling_time = time.time() - sampling_start
+
+         current_frames = tensordict_data.numel()
+         total_frames += current_frames
+         data_view = tensordict_data.reshape(-1)
+         replay_buffer.extend(data_view)
+
+         training_tds = []
+         training_start = time.time()
+         for _ in range(cfg.train.num_epochs):
+             for _ in range(cfg.collector.frames_per_batch // cfg.train.minibatch_size):
+                 subdata = replay_buffer.sample()
+                 loss_vals = loss_module(subdata)
+                 training_tds.append(loss_vals.detach())
+
+                 loss_value = (
+                     loss_vals["loss_actor"]
+                     + loss_vals["loss_alpha"]
+                     + loss_vals["loss_qvalue"]
+                 )
+
+                 loss_value.backward()
+
+                 total_norm = torch.nn.utils.clip_grad_norm_(
+                     loss_module.parameters(), cfg.train.max_grad_norm
+                 )
+                 training_tds[-1].set("grad_norm", total_norm.mean())
+
+                 optim.step()
+                 optim.zero_grad()
+                 target_net_updater.step()
+
+         collector.update_policy_weights_()
+
+         training_time = time.time() - training_start
+
+         iteration_time = sampling_time + training_time
+         total_time += iteration_time
+         training_tds = torch.stack(training_tds)
+
+         # More logs
+         if cfg.logger.backend:
+             log_training(
+                 logger,
+                 training_tds,
+                 tensordict_data,
+                 sampling_time,
+                 training_time,
+                 total_time,
+                 i,
+                 current_frames,
+                 total_frames,
+                 step=i,
+             )
+
+         if (
+             cfg.eval.evaluation_episodes > 0
+             and i % cfg.eval.evaluation_interval == 0
+             and cfg.logger.backend
+         ):
+             evaluation_start = time.time()
+             with torch.no_grad(), set_exploration_type(ExplorationType.DETERMINISTIC):
+                 env_test.frames = []
+                 rollouts = env_test.rollout(
+                     max_steps=cfg.env.max_steps,
+                     policy=policy,
+                     callback=rendering_callback,
+                     auto_cast_to_device=True,
+                     break_when_any_done=False,
+                     # We are running vectorized evaluation; we do not want it to stop when just one env is done
+                 )
+
+                 evaluation_time = time.time() - evaluation_start
+
+                 log_evaluation(logger, rollouts, env_test, evaluation_time, step=i)
+
+         if cfg.logger.backend == "wandb":
+             logger.experiment.log({}, commit=True)
+         sampling_start = time.time()
+     collector.shutdown()
+     if not env.is_closed:
+         env.close()
+     if not env_test.is_closed:
+         env_test.close()
+
+
+ if __name__ == "__main__":
+     train()
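Editor's note (not part of the package): in the script above, the soft-update rate `tau` from the config is converted into the `eps` argument of `SoftUpdate` via `eps = 1 - cfg.loss.tau`, because `eps` is the fraction of each target parameter retained at every update. A minimal sketch of that relationship, with an illustrative `tau` value:

```python
# Editorial sketch only; the real value comes from cfg.loss.tau.
tau = 0.005
eps = 1 - tau  # 0.995


def soft_update(target: float, online: float) -> float:
    # target <- eps * target + (1 - eps) * online, the blend applied by target_net_updater.step()
    return eps * target + (1 - eps) * online


print(soft_update(1.0, 0.0))  # 0.995
```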
sota-implementations/multiagent/utils/__init__.py
@@ -0,0 +1,4 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
sota-implementations/multiagent/utils/logging.py
@@ -0,0 +1,151 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ from __future__ import annotations
+
+ import os
+
+ import numpy as np
+ import torch
+ from tensordict import TensorDictBase
+ from torchrl.envs.libs.vmas import VmasEnv
+ from torchrl.record.loggers import generate_exp_name, get_logger, Logger
+ from torchrl.record.loggers.wandb import WandbLogger
+
+
+ def init_logging(cfg, model_name: str):
+     logger = get_logger(
+         logger_type=cfg.logger.backend,
+         logger_name=os.getcwd(),
+         experiment_name=generate_exp_name(cfg.env.scenario_name, model_name),
+         wandb_kwargs={
+             "group": cfg.logger.group_name or model_name,
+             "project": cfg.logger.project_name
+             or f"torchrl_example_{cfg.env.scenario_name}",
+         },
+     )
+     logger.log_hparams(cfg)
+     return logger
+
+
+ def log_training(
+     logger: Logger,
+     training_td: TensorDictBase,
+     sampling_td: TensorDictBase,
+     sampling_time: float,
+     training_time: float,
+     total_time: float,
+     iteration: int,
+     current_frames: int,
+     total_frames: int,
+     step: int,
+ ):
+     if ("next", "agents", "reward") not in sampling_td.keys(True, True):
+         sampling_td.set(
+             ("next", "agents", "reward"),
+             sampling_td.get(("next", "reward"))
+             .expand(sampling_td.get("agents").shape)
+             .unsqueeze(-1),
+         )
+     if ("next", "agents", "episode_reward") not in sampling_td.keys(True, True):
+         sampling_td.set(
+             ("next", "agents", "episode_reward"),
+             sampling_td.get(("next", "episode_reward"))
+             .expand(sampling_td.get("agents").shape)
+             .unsqueeze(-1),
+         )
+
+     metrics_to_log = {
+         f"train/learner/{key}": value.mean().item()
+         for key, value in training_td.items()
+     }
+
+     if "info" in sampling_td.get("agents").keys():
+         metrics_to_log.update(
+             {
+                 f"train/info/{key}": value.mean().item()
+                 for key, value in sampling_td.get(("agents", "info")).items()
+             }
+         )
+
+     reward = sampling_td.get(("next", "agents", "reward")).mean(-2)  # Mean over agents
+     done = sampling_td.get(("next", "done"))
+     if done.ndim > reward.ndim:
+         done = done[..., 0, :]  # Remove expanded agent dim
+     episode_reward = sampling_td.get(("next", "agents", "episode_reward")).mean(-2)[
+         done
+     ]
+     metrics_to_log.update(
+         {
+             "train/reward/reward_min": reward.min().item(),
+             "train/reward/reward_mean": reward.mean().item(),
+             "train/reward/reward_max": reward.max().item(),
+             "train/reward/episode_reward_min": episode_reward.min().item(),
+             "train/reward/episode_reward_mean": episode_reward.mean().item(),
+             "train/reward/episode_reward_max": episode_reward.max().item(),
+             "train/sampling_time": sampling_time,
+             "train/training_time": training_time,
+             "train/iteration_time": training_time + sampling_time,
+             "train/total_time": total_time,
+             "train/training_iteration": iteration,
+             "train/current_frames": current_frames,
+             "train/total_frames": total_frames,
+         }
+     )
+     if isinstance(logger, WandbLogger):
+         logger.experiment.log(metrics_to_log, commit=False)
+     else:
+         for key, value in metrics_to_log.items():
+             logger.log_scalar(key.replace("/", "_"), value, step=step)
+
+     return metrics_to_log
+
+
+ def log_evaluation(
+     logger: WandbLogger,
+     rollouts: TensorDictBase,
+     env_test: VmasEnv,
+     evaluation_time: float,
+     step: int,
+ ):
+     rollouts = list(rollouts.unbind(0))
+     for k, r in enumerate(rollouts):
+         next_done = r.get(("next", "done")).sum(
+             tuple(range(r.batch_dims, r.get(("next", "done")).ndim)),
+             dtype=torch.bool,
+         )
+         done_index = next_done.nonzero(as_tuple=True)[0][
+             0
+         ]  # First done index for this traj
+         rollouts[k] = r[: done_index + 1]
+
+     rewards = [td.get(("next", "agents", "reward")).sum(0).mean() for td in rollouts]
+     metrics_to_log = {
+         "eval/episode_reward_min": min(rewards),
+         "eval/episode_reward_max": max(rewards),
+         "eval/episode_reward_mean": sum(rewards) / len(rollouts),
+         "eval/episode_len_mean": sum([td.batch_size[0] for td in rollouts])
+         / len(rollouts),
+         "eval/evaluation_time": evaluation_time,
+     }
+
+     vid = torch.tensor(
+         np.transpose(env_test.frames[: rollouts[0].batch_size[0]], (0, 3, 1, 2)),
+         dtype=torch.uint8,
+     ).unsqueeze(0)
+
+     if isinstance(logger, WandbLogger):
+         import wandb
+
+         logger.experiment.log(metrics_to_log, commit=False)
+         logger.experiment.log(
+             {
+                 "eval/video": wandb.Video(vid, fps=1 / env_test.world.dt, format="mp4"),
+             },
+             commit=False,
+         )
+     else:
+         for key, value in metrics_to_log.items():
+             logger.log_scalar(key.replace("/", "_"), value, step=step)
+         logger.log_video("eval_video", vid, step=step)
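Editor's note (not part of the package): in `log_training` above, completed-episode returns are selected by averaging `episode_reward` over the agent dimension and then indexing with the boolean `done` mask. A small, self-contained shape walk-through with made-up values:

```python
import torch

frames, n_agents = 6, 2
episode_reward = torch.arange(frames * n_agents, dtype=torch.float).reshape(
    frames, n_agents, 1
)
done = torch.tensor([[False], [True], [False], [False], [True], [False]])  # (frames, 1)

per_frame_return = episode_reward.mean(-2)  # (frames, 1): mean over the agent dim
completed = per_frame_return[done]          # 1-D, one value per frame where done is True
print(completed.shape)                      # torch.Size([2])
```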
sota-implementations/multiagent/utils/utils.py
@@ -0,0 +1,43 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ from __future__ import annotations
+
+ from tensordict import unravel_key
+ from torchrl.envs import Transform
+
+
+ def swap_last(source, dest):
+     source = unravel_key(source)
+     dest = unravel_key(dest)
+     if isinstance(source, str):
+         if isinstance(dest, str):
+             return dest
+         return dest[-1]
+     if isinstance(dest, str):
+         return source[:-1] + (dest,)
+     return source[:-1] + (dest[-1],)
+
+
+ class DoneTransform(Transform):
+     """Expands the 'done' entries (incl. terminated) to match the reward shape.
+
+     Can be appended to a replay buffer or a collector.
+     """
+
+     def __init__(self, reward_key, done_keys):
+         super().__init__()
+         self.reward_key = reward_key
+         self.done_keys = done_keys
+
+     def forward(self, tensordict):
+         for done_key in self.done_keys:
+             new_name = swap_last(self.reward_key, done_key)
+             tensordict.set(
+                 ("next", new_name),
+                 tensordict.get(("next", done_key))
+                 .unsqueeze(-1)
+                 .expand(tensordict.get(("next", self.reward_key)).shape),
+             )
+         return tensordict
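Editor's note (not part of the package): `swap_last` keeps the prefix of the reward key and swaps in the last element of the done key, which is how `DoneTransform` decides where to write the expanded done/terminated entries. A short illustration, assuming `swap_last` from the file above is in scope:

```python
# Expected key mappings for swap_last as defined above.
assert swap_last(("agents", "reward"), "done") == ("agents", "done")
assert swap_last(("agents", "reward"), ("next", "terminated")) == ("agents", "terminated")
assert swap_last("reward", "done") == "done"
# DoneTransform then writes, e.g., ("next", "agents", "done"), broadcast via
# unsqueeze(-1).expand(...) to the shape of ("next", "agents", "reward").
```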
sota-implementations/ppo/README.md
@@ -0,0 +1,29 @@
+ ## Reproducing Proximal Policy Optimization (PPO) Algorithm Results
+
+ This repository contains scripts for training agents with the Proximal Policy Optimization (PPO) algorithm on MuJoCo and Atari environments. We follow the original paper, [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) by Schulman et al. (2017), with one improvement: the Generalised Advantage Estimator (GAE) is recomputed at every epoch.
+
+
+ ## Examples Structure
+
+ Please note that, for the sake of simplicity, each example is independent of the others. Each example contains the following files:
+
+ 1. **Main Script:** The algorithm components and the training loop are defined in the main script (e.g. ppo_atari.py).
+
+ 2. **Utils File:** A utility file provides helper functions, mainly for creating the environment and the models (e.g. utils_atari.py).
+
+ 3. **Configuration File:** This file contains the default hyperparameters specified in the original paper. Users can modify these hyperparameters to customize their experiments (e.g. config_atari.yaml).
+
+
+ ## Running the Examples
+
+ You can run the PPO algorithm on Atari environments with the following command:
+
+ ```bash
+ python ppo_atari.py
+ ```
+
+ You can run the PPO algorithm on MuJoCo environments with the following command:
+
+ ```bash
+ python ppo_mujoco.py
+ ```
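Editor's note (not part of the package): the README above mentions recomputing the Generalised Advantage Estimator (GAE) at every epoch; torchrl ships its own GAE implementation under `torchrl.objectives.value` (see `advantages.py` in the file list). For reference, a minimal, self-contained sketch of the GAE recursion on plain tensors, with made-up data:

```python
import torch


def gae(rewards, values, next_values, dones, gamma=0.99, lmbda=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # A_t     = delta_t + gamma * lmbda * (1 - done_t) * A_{t+1}
    advantages = torch.zeros_like(rewards)
    running = torch.zeros(())
    for t in reversed(range(rewards.shape[0])):
        not_done = (~dones[t]).float()
        delta = rewards[t] + gamma * next_values[t] * not_done - values[t]
        running = delta + gamma * lmbda * not_done * running
        advantages[t] = running
    return advantages


T = 8
adv = gae(torch.randn(T), torch.randn(T), torch.randn(T), torch.zeros(T, dtype=torch.bool))
print(adv.shape)  # torch.Size([8])
# Recomputing GAE at every epoch means refreshing V(s) and V(s') with the
# latest value-network weights before each pass over the batch.
```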