torchrl-0.11.0-cp314-cp314-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (394)
  1. benchmarks/benchmark_batched_envs.py +104 -0
  2. benchmarks/conftest.py +91 -0
  3. benchmarks/ecosystem/gym_env_throughput.py +321 -0
  4. benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
  5. benchmarks/requirements.txt +7 -0
  6. benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
  7. benchmarks/test_collectors_benchmark.py +240 -0
  8. benchmarks/test_compressed_storage_benchmark.py +145 -0
  9. benchmarks/test_envs_benchmark.py +133 -0
  10. benchmarks/test_llm.py +101 -0
  11. benchmarks/test_non_tensor_env_benchmark.py +70 -0
  12. benchmarks/test_objectives_benchmarks.py +1199 -0
  13. benchmarks/test_replaybuffer_benchmark.py +254 -0
  14. sota-check/README.md +35 -0
  15. sota-implementations/README.md +142 -0
  16. sota-implementations/a2c/README.md +39 -0
  17. sota-implementations/a2c/a2c_atari.py +291 -0
  18. sota-implementations/a2c/a2c_mujoco.py +273 -0
  19. sota-implementations/a2c/utils_atari.py +240 -0
  20. sota-implementations/a2c/utils_mujoco.py +160 -0
  21. sota-implementations/bandits/README.md +7 -0
  22. sota-implementations/bandits/dqn.py +126 -0
  23. sota-implementations/cql/cql_offline.py +198 -0
  24. sota-implementations/cql/cql_online.py +249 -0
  25. sota-implementations/cql/discrete_cql_offline.py +180 -0
  26. sota-implementations/cql/discrete_cql_online.py +227 -0
  27. sota-implementations/cql/utils.py +471 -0
  28. sota-implementations/crossq/crossq.py +271 -0
  29. sota-implementations/crossq/utils.py +320 -0
  30. sota-implementations/ddpg/ddpg.py +231 -0
  31. sota-implementations/ddpg/utils.py +325 -0
  32. sota-implementations/decision_transformer/dt.py +163 -0
  33. sota-implementations/decision_transformer/lamb.py +167 -0
  34. sota-implementations/decision_transformer/online_dt.py +178 -0
  35. sota-implementations/decision_transformer/utils.py +562 -0
  36. sota-implementations/discrete_sac/discrete_sac.py +243 -0
  37. sota-implementations/discrete_sac/utils.py +324 -0
  38. sota-implementations/dqn/README.md +30 -0
  39. sota-implementations/dqn/dqn_atari.py +272 -0
  40. sota-implementations/dqn/dqn_cartpole.py +236 -0
  41. sota-implementations/dqn/utils_atari.py +132 -0
  42. sota-implementations/dqn/utils_cartpole.py +90 -0
  43. sota-implementations/dreamer/README.md +129 -0
  44. sota-implementations/dreamer/dreamer.py +586 -0
  45. sota-implementations/dreamer/dreamer_utils.py +1107 -0
  46. sota-implementations/expert-iteration/README.md +352 -0
  47. sota-implementations/expert-iteration/ei_utils.py +770 -0
  48. sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
  49. sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
  50. sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
  51. sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
  52. sota-implementations/gail/gail.py +327 -0
  53. sota-implementations/gail/gail_utils.py +68 -0
  54. sota-implementations/gail/ppo_utils.py +157 -0
  55. sota-implementations/grpo/README.md +273 -0
  56. sota-implementations/grpo/grpo-async.py +437 -0
  57. sota-implementations/grpo/grpo-sync.py +435 -0
  58. sota-implementations/grpo/grpo_utils.py +843 -0
  59. sota-implementations/grpo/requirements_gsm8k.txt +11 -0
  60. sota-implementations/grpo/requirements_ifeval.txt +16 -0
  61. sota-implementations/impala/README.md +33 -0
  62. sota-implementations/impala/impala_multi_node_ray.py +292 -0
  63. sota-implementations/impala/impala_multi_node_submitit.py +284 -0
  64. sota-implementations/impala/impala_single_node.py +261 -0
  65. sota-implementations/impala/utils.py +184 -0
  66. sota-implementations/iql/discrete_iql.py +230 -0
  67. sota-implementations/iql/iql_offline.py +164 -0
  68. sota-implementations/iql/iql_online.py +225 -0
  69. sota-implementations/iql/utils.py +437 -0
  70. sota-implementations/multiagent/README.md +74 -0
  71. sota-implementations/multiagent/iql.py +237 -0
  72. sota-implementations/multiagent/maddpg_iddpg.py +266 -0
  73. sota-implementations/multiagent/mappo_ippo.py +267 -0
  74. sota-implementations/multiagent/qmix_vdn.py +271 -0
  75. sota-implementations/multiagent/sac.py +337 -0
  76. sota-implementations/multiagent/utils/__init__.py +4 -0
  77. sota-implementations/multiagent/utils/logging.py +151 -0
  78. sota-implementations/multiagent/utils/utils.py +43 -0
  79. sota-implementations/ppo/README.md +29 -0
  80. sota-implementations/ppo/ppo_atari.py +305 -0
  81. sota-implementations/ppo/ppo_mujoco.py +293 -0
  82. sota-implementations/ppo/utils_atari.py +238 -0
  83. sota-implementations/ppo/utils_mujoco.py +152 -0
  84. sota-implementations/ppo_trainer/train.py +21 -0
  85. sota-implementations/redq/README.md +7 -0
  86. sota-implementations/redq/redq.py +199 -0
  87. sota-implementations/redq/utils.py +1060 -0
  88. sota-implementations/sac/sac-async.py +266 -0
  89. sota-implementations/sac/sac.py +239 -0
  90. sota-implementations/sac/utils.py +381 -0
  91. sota-implementations/sac_trainer/train.py +16 -0
  92. sota-implementations/td3/td3.py +254 -0
  93. sota-implementations/td3/utils.py +319 -0
  94. sota-implementations/td3_bc/td3_bc.py +177 -0
  95. sota-implementations/td3_bc/utils.py +251 -0
  96. torchrl/__init__.py +144 -0
  97. torchrl/_extension.py +74 -0
  98. torchrl/_torchrl.cpython-314-aarch64-linux-gnu.so +0 -0
  99. torchrl/_utils.py +1431 -0
  100. torchrl/collectors/__init__.py +48 -0
  101. torchrl/collectors/_base.py +1058 -0
  102. torchrl/collectors/_constants.py +88 -0
  103. torchrl/collectors/_multi_async.py +324 -0
  104. torchrl/collectors/_multi_base.py +1805 -0
  105. torchrl/collectors/_multi_sync.py +464 -0
  106. torchrl/collectors/_runner.py +581 -0
  107. torchrl/collectors/_single.py +2009 -0
  108. torchrl/collectors/_single_async.py +259 -0
  109. torchrl/collectors/collectors.py +62 -0
  110. torchrl/collectors/distributed/__init__.py +32 -0
  111. torchrl/collectors/distributed/default_configs.py +133 -0
  112. torchrl/collectors/distributed/generic.py +1306 -0
  113. torchrl/collectors/distributed/ray.py +1092 -0
  114. torchrl/collectors/distributed/rpc.py +1006 -0
  115. torchrl/collectors/distributed/sync.py +731 -0
  116. torchrl/collectors/distributed/utils.py +160 -0
  117. torchrl/collectors/llm/__init__.py +10 -0
  118. torchrl/collectors/llm/base.py +494 -0
  119. torchrl/collectors/llm/ray_collector.py +275 -0
  120. torchrl/collectors/llm/utils.py +36 -0
  121. torchrl/collectors/llm/weight_update/__init__.py +10 -0
  122. torchrl/collectors/llm/weight_update/vllm.py +348 -0
  123. torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
  124. torchrl/collectors/utils.py +433 -0
  125. torchrl/collectors/weight_update.py +591 -0
  126. torchrl/csrc/numpy_utils.h +38 -0
  127. torchrl/csrc/pybind.cpp +27 -0
  128. torchrl/csrc/segment_tree.h +458 -0
  129. torchrl/csrc/torch_utils.h +34 -0
  130. torchrl/csrc/utils.cpp +48 -0
  131. torchrl/csrc/utils.h +31 -0
  132. torchrl/data/__init__.py +187 -0
  133. torchrl/data/datasets/__init__.py +58 -0
  134. torchrl/data/datasets/atari_dqn.py +878 -0
  135. torchrl/data/datasets/common.py +281 -0
  136. torchrl/data/datasets/d4rl.py +489 -0
  137. torchrl/data/datasets/d4rl_infos.py +187 -0
  138. torchrl/data/datasets/gen_dgrl.py +375 -0
  139. torchrl/data/datasets/minari_data.py +643 -0
  140. torchrl/data/datasets/openml.py +177 -0
  141. torchrl/data/datasets/openx.py +798 -0
  142. torchrl/data/datasets/roboset.py +363 -0
  143. torchrl/data/datasets/utils.py +11 -0
  144. torchrl/data/datasets/vd4rl.py +432 -0
  145. torchrl/data/llm/__init__.py +34 -0
  146. torchrl/data/llm/dataset.py +491 -0
  147. torchrl/data/llm/history.py +1378 -0
  148. torchrl/data/llm/prompt.py +198 -0
  149. torchrl/data/llm/reward.py +225 -0
  150. torchrl/data/llm/topk.py +186 -0
  151. torchrl/data/llm/utils.py +543 -0
  152. torchrl/data/map/__init__.py +21 -0
  153. torchrl/data/map/hash.py +185 -0
  154. torchrl/data/map/query.py +204 -0
  155. torchrl/data/map/tdstorage.py +363 -0
  156. torchrl/data/map/tree.py +1434 -0
  157. torchrl/data/map/utils.py +103 -0
  158. torchrl/data/postprocs/__init__.py +8 -0
  159. torchrl/data/postprocs/postprocs.py +391 -0
  160. torchrl/data/replay_buffers/__init__.py +99 -0
  161. torchrl/data/replay_buffers/checkpointers.py +622 -0
  162. torchrl/data/replay_buffers/ray_buffer.py +292 -0
  163. torchrl/data/replay_buffers/replay_buffers.py +2376 -0
  164. torchrl/data/replay_buffers/samplers.py +2578 -0
  165. torchrl/data/replay_buffers/scheduler.py +265 -0
  166. torchrl/data/replay_buffers/storages.py +2412 -0
  167. torchrl/data/replay_buffers/utils.py +1042 -0
  168. torchrl/data/replay_buffers/writers.py +781 -0
  169. torchrl/data/tensor_specs.py +7101 -0
  170. torchrl/data/utils.py +334 -0
  171. torchrl/envs/__init__.py +265 -0
  172. torchrl/envs/async_envs.py +1105 -0
  173. torchrl/envs/batched_envs.py +3093 -0
  174. torchrl/envs/common.py +4241 -0
  175. torchrl/envs/custom/__init__.py +11 -0
  176. torchrl/envs/custom/chess.py +617 -0
  177. torchrl/envs/custom/llm.py +214 -0
  178. torchrl/envs/custom/pendulum.py +401 -0
  179. torchrl/envs/custom/san_moves.txt +29274 -0
  180. torchrl/envs/custom/tictactoeenv.py +288 -0
  181. torchrl/envs/env_creator.py +263 -0
  182. torchrl/envs/gym_like.py +752 -0
  183. torchrl/envs/libs/__init__.py +68 -0
  184. torchrl/envs/libs/_gym_utils.py +326 -0
  185. torchrl/envs/libs/brax.py +846 -0
  186. torchrl/envs/libs/dm_control.py +544 -0
  187. torchrl/envs/libs/envpool.py +447 -0
  188. torchrl/envs/libs/gym.py +2239 -0
  189. torchrl/envs/libs/habitat.py +138 -0
  190. torchrl/envs/libs/isaac_lab.py +87 -0
  191. torchrl/envs/libs/isaacgym.py +203 -0
  192. torchrl/envs/libs/jax_utils.py +166 -0
  193. torchrl/envs/libs/jumanji.py +963 -0
  194. torchrl/envs/libs/meltingpot.py +599 -0
  195. torchrl/envs/libs/openml.py +153 -0
  196. torchrl/envs/libs/openspiel.py +652 -0
  197. torchrl/envs/libs/pettingzoo.py +1042 -0
  198. torchrl/envs/libs/procgen.py +351 -0
  199. torchrl/envs/libs/robohive.py +429 -0
  200. torchrl/envs/libs/smacv2.py +645 -0
  201. torchrl/envs/libs/unity_mlagents.py +891 -0
  202. torchrl/envs/libs/utils.py +147 -0
  203. torchrl/envs/libs/vmas.py +813 -0
  204. torchrl/envs/llm/__init__.py +63 -0
  205. torchrl/envs/llm/chat.py +730 -0
  206. torchrl/envs/llm/datasets/README.md +4 -0
  207. torchrl/envs/llm/datasets/__init__.py +17 -0
  208. torchrl/envs/llm/datasets/gsm8k.py +353 -0
  209. torchrl/envs/llm/datasets/ifeval.py +274 -0
  210. torchrl/envs/llm/envs.py +789 -0
  211. torchrl/envs/llm/libs/README.md +3 -0
  212. torchrl/envs/llm/libs/__init__.py +8 -0
  213. torchrl/envs/llm/libs/mlgym.py +869 -0
  214. torchrl/envs/llm/reward/__init__.py +10 -0
  215. torchrl/envs/llm/reward/gsm8k.py +324 -0
  216. torchrl/envs/llm/reward/ifeval/README.md +13 -0
  217. torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
  218. torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
  219. torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
  220. torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
  221. torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
  222. torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
  223. torchrl/envs/llm/transforms/__init__.py +55 -0
  224. torchrl/envs/llm/transforms/browser.py +292 -0
  225. torchrl/envs/llm/transforms/dataloading.py +859 -0
  226. torchrl/envs/llm/transforms/format.py +73 -0
  227. torchrl/envs/llm/transforms/kl.py +1544 -0
  228. torchrl/envs/llm/transforms/policy_version.py +189 -0
  229. torchrl/envs/llm/transforms/reason.py +323 -0
  230. torchrl/envs/llm/transforms/tokenizer.py +321 -0
  231. torchrl/envs/llm/transforms/tools.py +1955 -0
  232. torchrl/envs/model_based/__init__.py +9 -0
  233. torchrl/envs/model_based/common.py +180 -0
  234. torchrl/envs/model_based/dreamer.py +112 -0
  235. torchrl/envs/transforms/__init__.py +147 -0
  236. torchrl/envs/transforms/functional.py +48 -0
  237. torchrl/envs/transforms/gym_transforms.py +203 -0
  238. torchrl/envs/transforms/module.py +341 -0
  239. torchrl/envs/transforms/r3m.py +372 -0
  240. torchrl/envs/transforms/ray_service.py +663 -0
  241. torchrl/envs/transforms/rb_transforms.py +214 -0
  242. torchrl/envs/transforms/transforms.py +11835 -0
  243. torchrl/envs/transforms/utils.py +94 -0
  244. torchrl/envs/transforms/vc1.py +307 -0
  245. torchrl/envs/transforms/vecnorm.py +845 -0
  246. torchrl/envs/transforms/vip.py +407 -0
  247. torchrl/envs/utils.py +1718 -0
  248. torchrl/envs/vec_envs.py +11 -0
  249. torchrl/modules/__init__.py +206 -0
  250. torchrl/modules/distributions/__init__.py +73 -0
  251. torchrl/modules/distributions/continuous.py +830 -0
  252. torchrl/modules/distributions/discrete.py +908 -0
  253. torchrl/modules/distributions/truncated_normal.py +187 -0
  254. torchrl/modules/distributions/utils.py +233 -0
  255. torchrl/modules/llm/__init__.py +62 -0
  256. torchrl/modules/llm/backends/__init__.py +65 -0
  257. torchrl/modules/llm/backends/vllm/__init__.py +94 -0
  258. torchrl/modules/llm/backends/vllm/_models.py +46 -0
  259. torchrl/modules/llm/backends/vllm/base.py +72 -0
  260. torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
  261. torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
  262. torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
  263. torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
  264. torchrl/modules/llm/policies/__init__.py +28 -0
  265. torchrl/modules/llm/policies/common.py +1809 -0
  266. torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
  267. torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
  268. torchrl/modules/llm/utils.py +23 -0
  269. torchrl/modules/mcts/__init__.py +21 -0
  270. torchrl/modules/mcts/scores.py +579 -0
  271. torchrl/modules/models/__init__.py +86 -0
  272. torchrl/modules/models/batchrenorm.py +119 -0
  273. torchrl/modules/models/decision_transformer.py +179 -0
  274. torchrl/modules/models/exploration.py +731 -0
  275. torchrl/modules/models/llm.py +156 -0
  276. torchrl/modules/models/model_based.py +596 -0
  277. torchrl/modules/models/models.py +1712 -0
  278. torchrl/modules/models/multiagent.py +1067 -0
  279. torchrl/modules/models/recipes/impala.py +185 -0
  280. torchrl/modules/models/utils.py +162 -0
  281. torchrl/modules/planners/__init__.py +10 -0
  282. torchrl/modules/planners/cem.py +228 -0
  283. torchrl/modules/planners/common.py +73 -0
  284. torchrl/modules/planners/mppi.py +265 -0
  285. torchrl/modules/tensordict_module/__init__.py +89 -0
  286. torchrl/modules/tensordict_module/actors.py +2457 -0
  287. torchrl/modules/tensordict_module/common.py +529 -0
  288. torchrl/modules/tensordict_module/exploration.py +814 -0
  289. torchrl/modules/tensordict_module/probabilistic.py +321 -0
  290. torchrl/modules/tensordict_module/rnn.py +1639 -0
  291. torchrl/modules/tensordict_module/sequence.py +132 -0
  292. torchrl/modules/tensordict_module/world_models.py +34 -0
  293. torchrl/modules/utils/__init__.py +38 -0
  294. torchrl/modules/utils/mappings.py +9 -0
  295. torchrl/modules/utils/utils.py +89 -0
  296. torchrl/objectives/__init__.py +78 -0
  297. torchrl/objectives/a2c.py +659 -0
  298. torchrl/objectives/common.py +753 -0
  299. torchrl/objectives/cql.py +1346 -0
  300. torchrl/objectives/crossq.py +710 -0
  301. torchrl/objectives/ddpg.py +453 -0
  302. torchrl/objectives/decision_transformer.py +371 -0
  303. torchrl/objectives/deprecated.py +516 -0
  304. torchrl/objectives/dqn.py +683 -0
  305. torchrl/objectives/dreamer.py +488 -0
  306. torchrl/objectives/functional.py +48 -0
  307. torchrl/objectives/gail.py +258 -0
  308. torchrl/objectives/iql.py +996 -0
  309. torchrl/objectives/llm/__init__.py +30 -0
  310. torchrl/objectives/llm/grpo.py +846 -0
  311. torchrl/objectives/llm/sft.py +482 -0
  312. torchrl/objectives/multiagent/__init__.py +8 -0
  313. torchrl/objectives/multiagent/qmixer.py +396 -0
  314. torchrl/objectives/ppo.py +1669 -0
  315. torchrl/objectives/redq.py +683 -0
  316. torchrl/objectives/reinforce.py +530 -0
  317. torchrl/objectives/sac.py +1580 -0
  318. torchrl/objectives/td3.py +570 -0
  319. torchrl/objectives/td3_bc.py +625 -0
  320. torchrl/objectives/utils.py +782 -0
  321. torchrl/objectives/value/__init__.py +28 -0
  322. torchrl/objectives/value/advantages.py +1956 -0
  323. torchrl/objectives/value/functional.py +1459 -0
  324. torchrl/objectives/value/utils.py +360 -0
  325. torchrl/record/__init__.py +17 -0
  326. torchrl/record/loggers/__init__.py +23 -0
  327. torchrl/record/loggers/common.py +48 -0
  328. torchrl/record/loggers/csv.py +226 -0
  329. torchrl/record/loggers/mlflow.py +142 -0
  330. torchrl/record/loggers/tensorboard.py +139 -0
  331. torchrl/record/loggers/trackio.py +163 -0
  332. torchrl/record/loggers/utils.py +78 -0
  333. torchrl/record/loggers/wandb.py +214 -0
  334. torchrl/record/recorder.py +554 -0
  335. torchrl/services/__init__.py +79 -0
  336. torchrl/services/base.py +109 -0
  337. torchrl/services/ray_service.py +453 -0
  338. torchrl/testing/__init__.py +107 -0
  339. torchrl/testing/assertions.py +179 -0
  340. torchrl/testing/dist_utils.py +122 -0
  341. torchrl/testing/env_creators.py +227 -0
  342. torchrl/testing/env_helper.py +35 -0
  343. torchrl/testing/gym_helpers.py +156 -0
  344. torchrl/testing/llm_mocks.py +119 -0
  345. torchrl/testing/mocking_classes.py +2720 -0
  346. torchrl/testing/modules.py +295 -0
  347. torchrl/testing/mp_helpers.py +15 -0
  348. torchrl/testing/ray_helpers.py +293 -0
  349. torchrl/testing/utils.py +190 -0
  350. torchrl/trainers/__init__.py +42 -0
  351. torchrl/trainers/algorithms/__init__.py +11 -0
  352. torchrl/trainers/algorithms/configs/__init__.py +705 -0
  353. torchrl/trainers/algorithms/configs/collectors.py +216 -0
  354. torchrl/trainers/algorithms/configs/common.py +41 -0
  355. torchrl/trainers/algorithms/configs/data.py +308 -0
  356. torchrl/trainers/algorithms/configs/envs.py +104 -0
  357. torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
  358. torchrl/trainers/algorithms/configs/logging.py +80 -0
  359. torchrl/trainers/algorithms/configs/modules.py +570 -0
  360. torchrl/trainers/algorithms/configs/objectives.py +177 -0
  361. torchrl/trainers/algorithms/configs/trainers.py +340 -0
  362. torchrl/trainers/algorithms/configs/transforms.py +955 -0
  363. torchrl/trainers/algorithms/configs/utils.py +252 -0
  364. torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
  365. torchrl/trainers/algorithms/configs/weight_update.py +159 -0
  366. torchrl/trainers/algorithms/ppo.py +373 -0
  367. torchrl/trainers/algorithms/sac.py +308 -0
  368. torchrl/trainers/helpers/__init__.py +40 -0
  369. torchrl/trainers/helpers/collectors.py +416 -0
  370. torchrl/trainers/helpers/envs.py +573 -0
  371. torchrl/trainers/helpers/logger.py +33 -0
  372. torchrl/trainers/helpers/losses.py +132 -0
  373. torchrl/trainers/helpers/models.py +658 -0
  374. torchrl/trainers/helpers/replay_buffer.py +59 -0
  375. torchrl/trainers/helpers/trainers.py +301 -0
  376. torchrl/trainers/trainers.py +2052 -0
  377. torchrl/weight_update/__init__.py +33 -0
  378. torchrl/weight_update/_distributed.py +749 -0
  379. torchrl/weight_update/_mp.py +624 -0
  380. torchrl/weight_update/_noupdate.py +102 -0
  381. torchrl/weight_update/_ray.py +1032 -0
  382. torchrl/weight_update/_rpc.py +284 -0
  383. torchrl/weight_update/_shared.py +891 -0
  384. torchrl/weight_update/llm/__init__.py +32 -0
  385. torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
  386. torchrl/weight_update/llm/vllm_nccl.py +710 -0
  387. torchrl/weight_update/utils.py +73 -0
  388. torchrl/weight_update/weight_sync_schemes.py +1244 -0
  389. torchrl-0.11.0.dist-info/METADATA +1308 -0
  390. torchrl-0.11.0.dist-info/RECORD +394 -0
  391. torchrl-0.11.0.dist-info/WHEEL +5 -0
  392. torchrl-0.11.0.dist-info/entry_points.txt +2 -0
  393. torchrl-0.11.0.dist-info/licenses/LICENSE +21 -0
  394. torchrl-0.11.0.dist-info/top_level.txt +7 -0
torchrl/envs/libs/brax.py
@@ -0,0 +1,846 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+ from __future__ import annotations
+
+ import importlib.util
+ import warnings
+
+ import torch
+ from packaging import version
+ from tensordict import TensorDict, TensorDictBase
+
+ from torchrl.data.tensor_specs import Bounded, Composite, Unbounded
+ from torchrl.envs.batched_envs import ParallelEnv
+ from torchrl.envs.common import _EnvPostInit, _EnvWrapper
+ from torchrl.envs.libs.jax_utils import (
+     _extract_spec,
+     _ndarray_to_tensor,
+     _object_to_tensordict,
+     _tensor_to_ndarray,
+     _tensordict_to_object,
+     _tree_flatten,
+     _tree_reshape,
+ )
+ from torchrl.envs.utils import _classproperty
+
+ _has_brax = importlib.util.find_spec("brax") is not None
+
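+ # Default interval (in env steps) between automatic JAX cache clears.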
+ _DEFAULT_CACHE_CLEAR_FREQUENCY = 20
+
+
+ def _get_envs():
+     if not _has_brax:
+         raise ImportError("BRAX is not installed in your virtual environment.")
+
+     import brax.envs
+
+     return list(brax.envs._envs.keys())
+
+
+ class _BraxMeta(_EnvPostInit):
+     """Metaclass for BraxEnv that returns a lazy ParallelEnv when num_workers > 1."""
+
+     def __call__(cls, *args, num_workers: int | None = None, **kwargs):
+         # Extract num_workers from explicit kwarg or kwargs dict
+         if num_workers is None:
+             num_workers = kwargs.pop("num_workers", 1)
+         else:
+             kwargs.pop("num_workers", None)
+
+         num_workers = int(num_workers)
+         if cls.__name__ == "BraxEnv" and num_workers > 1:
+             # Extract env_name from args or kwargs
+             env_name = args[0] if len(args) >= 1 else kwargs.get("env_name")
+
+             # Remove env_name from kwargs if present (it will be passed positionally)
+             env_kwargs = {k: v for k, v in kwargs.items() if k != "env_name"}
+
+             # Create factory function that builds single BraxEnv instances
+             def make_env(_env_name=env_name, _kwargs=env_kwargs):
+                 return cls(_env_name, num_workers=1, **_kwargs)
+
+             # Return lazy ParallelEnv (workers not started yet)
+             return ParallelEnv(num_workers, make_env)
+
+         return super().__call__(*args, **kwargs)
+
+
+ class BraxWrapper(_EnvWrapper):
+     """Google Brax environment wrapper.
+
+     Brax offers a vectorized and differentiable simulation framework based on Jax.
+     TorchRL's wrapper incurs some overhead for the jax-to-torch conversion,
+     but computational graphs can still be built on top of the simulated trajectories,
+     allowing for backpropagation through the rollout.
+
+     GitHub: https://github.com/google/brax
+
+     Paper: https://arxiv.org/abs/2106.13281
+
+     Args:
+         env (brax.envs.base.PipelineEnv): the environment to wrap.
+         categorical_action_encoding (bool, optional): if ``True``, categorical
+             specs will be converted to the TorchRL equivalent (:class:`torchrl.data.Categorical`),
+             otherwise a one-hot encoding will be used (:class:`torchrl.data.OneHot`).
+             Defaults to ``False``.
+         cache_clear_frequency (int, optional): automatically clear JAX's internal
+             cache every N steps to prevent memory leaks when using ``requires_grad=True``.
+             Defaults to ``None``, which clears the cache every 20 steps; pass ``False``
+             to deactivate automatic cache clearing.
+
+     Keyword Args:
+         from_pixels (bool, optional): Not yet supported.
+         frame_skip (int, optional): if provided, indicates for how many steps the
+             same action is to be repeated. The observation returned will be the
+             last observation of the sequence, whereas the reward will be the sum
+             of rewards across steps.
+         device (torch.device, optional): if provided, the device on which the data
+             is to be cast. Defaults to ``torch.device("cpu")``.
+         batch_size (torch.Size, optional): the batch size of the environment.
+             In ``brax``, this controls the number of environments simulated in
+             parallel via JAX's ``vmap`` on a single device (GPU/TPU). Brax leverages
+             MuJoCo XLA (MJX) for hardware-accelerated batched simulation, enabling
+             thousands of environments to run in parallel within a single process.
+             Defaults to ``torch.Size([])``.
+         allow_done_after_reset (bool, optional): if ``True``, it is tolerated
+             for envs to be ``done`` just after :meth:`reset` is called.
+             Defaults to ``False``.
+
+     Attributes:
+         available_envs: environments available to build
+
+     Examples:
+         >>> import brax.envs
+         >>> from torchrl.envs import BraxWrapper
+         >>> import torch
+         >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+         >>> base_env = brax.envs.get_environment("ant")
+         >>> env = BraxWrapper(base_env, device=device)
+         >>> env.set_seed(0)
+         >>> td = env.reset()
+         >>> td["action"] = env.action_spec.rand()
+         >>> td = env.step(td)
+         >>> print(td)
+         TensorDict(
+             fields={
+                 action: Tensor(torch.Size([8]), dtype=torch.float32),
+                 done: Tensor(torch.Size([1]), dtype=torch.bool),
+                 next: TensorDict(
+                     fields={
+                         observation: Tensor(torch.Size([87]), dtype=torch.float32)},
+                     batch_size=torch.Size([]),
+                     device=cpu,
+                     is_shared=False),
+                 observation: Tensor(torch.Size([87]), dtype=torch.float32),
+                 reward: Tensor(torch.Size([1]), dtype=torch.float32),
+                 state: TensorDict(...)},
+             batch_size=torch.Size([]),
+             device=cpu,
+             is_shared=False)
+         >>> print(env.available_envs)
+         ['acrobot', 'ant', 'fast', 'fetch', ...]
+
+     To take advantage of Brax, one usually executes multiple environments at the
+     same time. In the following example, we iteratively test different batch sizes
+     and report the execution time for a short rollout:
+
+     Examples:
+         >>> import torch
+         >>> from torch.utils.benchmark import Timer
+         >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+         >>> for batch_size in [4, 16, 128]:
+         ...     timer = Timer('''
+         ... env.rollout(100)
+         ... ''',
+         ...     setup=f'''
+         ... import brax.envs
+         ... from torchrl.envs import BraxWrapper
+         ... env = BraxWrapper(brax.envs.get_environment("ant"), batch_size=[{batch_size}], device="{device}")
+         ... env.set_seed(0)
+         ... env.rollout(2)
+         ... ''')
+         ...     print(batch_size, timer.timeit(10))
+         4
+         env.rollout(100)
+         setup: [...]
+         310.00 ms
+         1 measurement, 10 runs , 1 thread
+
+         16
+         env.rollout(100)
+         setup: [...]
+         268.46 ms
+         1 measurement, 10 runs , 1 thread
+
+         128
+         env.rollout(100)
+         setup: [...]
+         433.80 ms
+         1 measurement, 10 runs , 1 thread
+
+     One can backpropagate through the rollout and optimize the policy directly:
+
+         >>> import brax.envs
+         >>> from torchrl.envs import BraxWrapper
+         >>> from tensordict.nn import TensorDictModule
+         >>> from torch import nn
+         >>> import torch
+         >>>
+         >>> env = BraxWrapper(brax.envs.get_environment("ant"), batch_size=[10], requires_grad=True, cache_clear_frequency=100)
+         >>> env.set_seed(0)
+         >>> torch.manual_seed(0)
+         >>> policy = TensorDictModule(nn.Linear(27, 8), in_keys=["observation"], out_keys=["action"])
+         >>>
+         >>> td = env.rollout(10, policy)
+         >>>
+         >>> td["next", "reward"].mean().backward(retain_graph=True)
+         >>> print(policy.module.weight.grad.norm())
+         tensor(213.8605)
+
+     """
+
+     git_url = "https://github.com/google/brax"
+
+     @_classproperty
+     def available_envs(cls):
+         if not _has_brax:
+             return []
+         return list(_get_envs())
+
+     libname = "brax"
+
+     _lib = None
+     _jax = None
+
+     @_classproperty
+     def lib(cls):
+         if cls._lib is not None:
+             return cls._lib
+
+         import brax
+         import brax.envs
+
+         cls._lib = brax
+         return brax
+
+     @_classproperty
+     def jax(cls):
+         if cls._jax is not None:
+             return cls._jax
+
+         import jax
+
+         cls._jax = jax
+         return jax
+
+     def __init__(
+         self,
+         env=None,
+         categorical_action_encoding=False,
+         cache_clear_frequency: int | None = None,
+         **kwargs,
+     ):
+         if env is not None:
+             kwargs["env"] = env
+         self._seed_calls_reset = None
+         self._categorical_action_encoding = categorical_action_encoding
+         # False disables automatic cache clearing; None/True select the
+         # default frequency (_DEFAULT_CACHE_CLEAR_FREQUENCY).
+         if cache_clear_frequency in (False,):
+             self._cache_clear_frequency = False
+         elif cache_clear_frequency in (None, True):
+             self._cache_clear_frequency = _DEFAULT_CACHE_CLEAR_FREQUENCY
+         else:
+             self._cache_clear_frequency = cache_clear_frequency
+         self._step_count = 0
+         super().__init__(**kwargs)
+         if not self.device:
+             warnings.warn(
+                 f"No device is set for env {self}. "
+                 f"Setting a device in Brax wrapped environments is strongly recommended."
+             )
+
+     def _check_kwargs(self, kwargs: dict):
+         brax = self.lib
+         if version.parse(brax.__version__) < version.parse("0.10.4"):
+             raise ImportError("Brax v0.10.4 or greater is required.")
+
+         if "env" not in kwargs:
+             raise TypeError("Could not find environment key 'env' in kwargs.")
+         env = kwargs["env"]
+         if not isinstance(env, brax.envs.Env):
+             raise TypeError("env is not of type 'brax.envs.Env'.")
+
+     def _build_env(
+         self,
+         env,
+         _seed: int | None = None,
+         from_pixels: bool = False,
+         render_kwargs: dict | None = None,
+         pixels_only: bool = False,
+         requires_grad: bool = False,
+         camera_id: int | str = 0,
+         **kwargs,
+     ):
+         self.from_pixels = from_pixels
+         self.pixels_only = pixels_only
+         self.requires_grad = requires_grad
+
+         if from_pixels:
+             raise NotImplementedError(
+                 "from_pixels=True is not yet supported within BraxWrapper"
+             )
+         return env
+
+     def _make_state_spec(self, env: brax.envs.env.Env):  # noqa: F821
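+         # Build the "state" spec by resetting the raw brax env once with a
+         # fixed PRNG key, converting the resulting State pytree to a
+         # TensorDict, and deriving a spec from that sample.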
+         jax = self.jax
+
+         key = jax.random.PRNGKey(0)
+         state = env.reset(key)
+         state_dict = _object_to_tensordict(state, self.device, batch_size=())
+         state_spec = _extract_spec(state_dict).expand(self.batch_size)
+         return state_spec
+
+     def _make_specs(self, env: brax.envs.env.Env) -> None:  # noqa: F821
+         self.action_spec = Bounded(
+             low=-1,
+             high=1,
+             shape=(
+                 *self.batch_size,
+                 env.action_size,
+             ),
+             device=self.device,
+         )
+         self.reward_spec = Unbounded(
+             shape=[
+                 *self.batch_size,
+                 1,
+             ],
+             device=self.device,
+         )
+         self.observation_spec = Composite(
+             observation=Unbounded(
+                 shape=(
+                     *self.batch_size,
+                     env.observation_size,
+                 ),
+                 device=self.device,
+             ),
+             shape=self.batch_size,
+         )
+         # extract state spec from instance
+         state_spec = self._make_state_spec(env)
+         self.state_spec["state"] = state_spec
+         self.observation_spec["state"] = state_spec.clone()
+
+     def _make_state_example(self):
+         jax = self.jax
+
+         key = jax.random.PRNGKey(0)
+         keys = jax.random.split(key, self.batch_size.numel())
+         state = self._vmap_jit_env_reset(jax.numpy.stack(keys))
+         state = _tree_reshape(state, self.batch_size)
+         return state
+
+     def _init_env(self) -> int | None:
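+         # Compile and vectorize the brax reset/step functions once, and keep
+         # a template state so tensordicts can be mapped back to brax pytrees.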
+         jax = self.jax
+         self._key = None
+         self._vmap_jit_env_reset = jax.vmap(jax.jit(self._env.reset))
+         self._vmap_jit_env_step = jax.vmap(jax.jit(self._env.step))
+         self._state_example = self._make_state_example()
+
+     def _set_seed(self, seed: int | None) -> None:
+         jax = self.jax
+         if seed is None:
+             raise Exception("Brax requires an integer seed.")
+         self._key = jax.random.PRNGKey(seed)
+
+     def _reset(self, tensordict: TensorDictBase = None, **kwargs) -> TensorDictBase:
+         jax = self.jax
+
+         # ensure a valid JAX PRNG key exists
+         if getattr(self, "_key", None) is None:
+             seed = getattr(self, "_seed", None)
+             if seed is None:
+                 seed = 0
+
+             self._key = jax.random.PRNGKey(int(seed))
+
+         # generate random keys
+         self._key, *keys = jax.random.split(self._key, 1 + self.numel())
+
+         # call env reset with jit and vmap
+         state = self._vmap_jit_env_reset(jax.numpy.stack(keys))
+
+         # reshape batch size
+         state = _tree_reshape(state, self.batch_size)
+         state = _object_to_tensordict(state, self.device, self.batch_size)
+
+         # build result
+         state["reward"] = state.get("reward").view(*self.reward_spec.shape)
+         state["done"] = state.get("done").view(*self.reward_spec.shape)
+         done = state["done"].bool()
+         tensordict_out = TensorDict._new_unsafe(
+             source={
+                 "observation": state.get("obs"),
+                 # "reward": reward,
+                 "done": done,
+                 "terminated": done.clone(),
+                 "state": state,
+             },
+             batch_size=self.batch_size,
+             device=self.device,
+         )
+         return tensordict_out
+
+     def _step_without_grad(self, tensordict: TensorDictBase):
+
+         # convert tensors to ndarrays
+         state = _tensordict_to_object(tensordict.get("state"), self._state_example)
+         action = _tensor_to_ndarray(tensordict.get("action"))
+
+         # flatten batch size
+         state = _tree_flatten(state, self.batch_size)
+         action = _tree_flatten(action, self.batch_size)
+
+         # call env step with jit and vmap
+         next_state = self._vmap_jit_env_step(state, action)
+
+         # reshape batch size and convert ndarrays to tensors
+         next_state = _tree_reshape(next_state, self.batch_size)
+         next_state = _object_to_tensordict(next_state, self.device, self.batch_size)
+
+         # build result
+         next_state.set("reward", next_state.get("reward").view(self.reward_spec.shape))
+         next_state.set("done", next_state.get("done").view(self.reward_spec.shape))
+         done = next_state["done"].bool()
+         reward = next_state["reward"]
+         tensordict_out = TensorDict._new_unsafe(
+             source={
+                 "observation": next_state.get("obs"),
+                 "reward": reward,
+                 "done": done,
+                 "terminated": done.clone(),
+                 "state": next_state,
+             },
+             batch_size=self.batch_size,
+             device=self.device,
+         )
+         return tensordict_out
+
+     def _step_with_grad(self, tensordict: TensorDictBase):
+
+         # convert tensors to ndarrays
+         action = tensordict.get("action")
+         state = tensordict.get("state")
+         qp_keys, qp_values = zip(*state.get("pipeline_state").items())
+
+         # call env step with autograd function
+         next_state_nograd, next_obs, next_reward, *next_qp_values = _BraxEnvStep.apply(
+             self, state, action, *qp_values
+         )
+
+         # extract done values: we assume a shape identical to reward
+         next_done = next_state_nograd.get("done").view(*self.reward_spec.shape)
+         next_reward = next_reward.view(*self.reward_spec.shape)
+
+         # merge the gradient-tracking tensors back into the next state
+         next_state = next_state_nograd
+         next_state["obs"] = next_obs
+         next_state.set("reward", next_reward)
+         next_state.set("done", next_done)
+         next_done = next_done.bool()
+         next_state.get("pipeline_state").update(dict(zip(qp_keys, next_qp_values)))
+
+         # build result
+         tensordict_out = TensorDict._new_unsafe(
+             source={
+                 "observation": next_obs,
+                 "reward": next_reward,
+                 "done": next_done,
+                 "terminated": next_done,
+                 "state": next_state,
+             },
+             batch_size=self.batch_size,
+             device=self.device,
+         )
+         return tensordict_out
+
+     def _step(
+         self,
+         tensordict: TensorDictBase,
+     ) -> TensorDictBase:
+
+         if self.requires_grad:
+             out = self._step_with_grad(tensordict)
+         else:
+             out = self._step_without_grad(tensordict)
+
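+         # Periodically drop JAX's internal caches to bound memory growth
+         # (see ``cache_clear_frequency``).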
+         self._step_count += 1
+         if (
+             self._cache_clear_frequency
+             and (self._step_count % self._cache_clear_frequency) == 0
+         ):
+             self.clear_cache()
+
+         return out
+
+     def clear_cache(self):
+         """Clear JAX's internal cache to prevent memory leaks.
+
+         This method should be called periodically when using requires_grad=True
+         to prevent memory accumulation from JAX's internal computation graph.
+         """
+         if hasattr(self, "jax"):
+             try:
+                 # Clear JAX's compilation cache
+                 if hasattr(self.jax.jit, "clear_caches"):
+                     self.jax.jit.clear_caches()
+                 # Alternative: clear JAX's internal cache
+                 if hasattr(self.jax, "clear_caches"):
+                     self.jax.clear_caches()
+                 # Clear JAX's XLA compilation cache if available
+                 try:
+                     import jaxlib
+
+                     if hasattr(jaxlib, "xla_extension"):
+                         jaxlib.xla_extension.clear_caches()
+                 except Exception:
+                     pass
+             except Exception:
+                 pass
+
+
+ class BraxEnv(BraxWrapper, metaclass=_BraxMeta):
+     """Google Brax environment wrapper built with the environment name.
+
+     Brax offers a vectorized and differentiable simulation framework based on Jax.
+     TorchRL's wrapper incurs some overhead for the jax-to-torch conversion,
+     but computational graphs can still be built on top of the simulated trajectories,
+     allowing for backpropagation through the rollout.
+
+     GitHub: https://github.com/google/brax
+
+     Paper: https://arxiv.org/abs/2106.13281
+
+     Args:
+         env_name (str): the name of the environment to wrap. Must be part of
+             :attr:`~.available_envs`.
+         categorical_action_encoding (bool, optional): if ``True``, categorical
+             specs will be converted to the TorchRL equivalent (:class:`torchrl.data.Categorical`),
+             otherwise a one-hot encoding will be used (:class:`torchrl.data.OneHot`).
+             Defaults to ``False``.
+         cache_clear_frequency (int, optional): automatically clear JAX's internal
+             cache every N steps to prevent memory leaks when using ``requires_grad=True``.
+             Defaults to ``None``, which clears the cache every 20 steps; pass ``False``
+             to deactivate automatic cache clearing.
+
+     Keyword Args:
+         from_pixels (bool, optional): Not yet supported.
+         frame_skip (int, optional): if provided, indicates for how many steps the
+             same action is to be repeated. The observation returned will be the
+             last observation of the sequence, whereas the reward will be the sum
+             of rewards across steps.
+         device (torch.device, optional): if provided, the device on which the data
+             is to be cast. Defaults to ``torch.device("cpu")``.
+         batch_size (torch.Size, optional): the batch size of the environment.
+             In ``brax``, this controls the number of environments simulated in
+             parallel via JAX's ``vmap`` on a single device (GPU/TPU). Brax leverages
+             MuJoCo XLA (MJX) for hardware-accelerated batched simulation, enabling
+             thousands of environments to run in parallel within a single process.
+             Defaults to ``torch.Size([])``.
+         allow_done_after_reset (bool, optional): if ``True``, it is tolerated
+             for envs to be ``done`` just after :meth:`reset` is called.
+             Defaults to ``False``.
+         num_workers (int, optional): if greater than 1, a lazy :class:`~torchrl.envs.ParallelEnv`
+             will be returned instead, with each worker instantiating its own
+             :class:`~torchrl.envs.BraxEnv` instance. Defaults to ``None``.
+
+     .. note::
+         There are two orthogonal ways to scale environment throughput:
+
+         - **batch_size**: Uses Brax's native JAX-based vectorization (``vmap``) to run
+           multiple environments in parallel on a single GPU/TPU. This is highly efficient
+           for moderate batch sizes where the MJX solver has not yet saturated.
+         - **num_workers**: Uses TorchRL's :class:`~torchrl.envs.ParallelEnv` to spawn
+           multiple Python processes, each running its own ``BraxEnv``.
+
+         These can be combined: ``BraxEnv("ant", batch_size=[128], num_workers=4)`` creates
+         4 worker processes, each running 128 vectorized environments, for a total of 512
+         parallel environments. This hybrid approach can be beneficial when the MJX solver
+         saturates on a single device, or when distributing across multiple GPUs/CPUs.
+
+     Attributes:
+         available_envs: environments available to build
+
+     Examples:
+         >>> from torchrl.envs import BraxEnv
+         >>> import torch
+         >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+         >>> env = BraxEnv("ant", device=device)
+         >>> env.set_seed(0)
+         >>> td = env.reset()
+         >>> td["action"] = env.action_spec.rand()
+         >>> td = env.step(td)
+         >>> print(td)
+         TensorDict(
+             fields={
+                 action: Tensor(torch.Size([8]), dtype=torch.float32),
+                 done: Tensor(torch.Size([1]), dtype=torch.bool),
+                 next: TensorDict(
+                     fields={
+                         observation: Tensor(torch.Size([87]), dtype=torch.float32)},
+                     batch_size=torch.Size([]),
+                     device=cpu,
+                     is_shared=False),
+                 observation: Tensor(torch.Size([87]), dtype=torch.float32),
+                 reward: Tensor(torch.Size([1]), dtype=torch.float32),
+                 state: TensorDict(...)},
+             batch_size=torch.Size([]),
+             device=cpu,
+             is_shared=False)
+         >>> print(env.available_envs)
+         ['acrobot', 'ant', 'fast', 'fetch', ...]
+
+         # Example: create a parallel environment with 4 workers. This returns a lazy
+         # ParallelEnv; each worker will instantiate a BraxEnv with num_workers=1.
+         >>> from torchrl.envs import BraxEnv
+         >>> par_env = BraxEnv("ant", batch_size=[8], num_workers=4, device="cpu")
+         >>> # par_env is a ParallelEnv; start interacting as usual
+         >>> par_env.set_seed(0)
+         >>> td = par_env.reset()
+         >>> print(td.shape)
+         torch.Size([4, 8])
+         >>> td["action"] = par_env.action_spec.rand()
+         >>> td = par_env.step(td)
+
+     To take advantage of Brax, one usually executes multiple environments at the
+     same time. In the following example, we iteratively test different batch sizes
+     and report the execution time for a short rollout:
+
+     Examples:
+         >>> import torch
+         >>> from torch.utils.benchmark import Timer
+         >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+         >>> for batch_size in [4, 16, 128]:
+         ...     timer = Timer('''
+         ... env.rollout(100)
+         ... ''',
+         ...     setup=f'''
+         ... from torchrl.envs import BraxEnv
+         ... env = BraxEnv("ant", batch_size=[{batch_size}], device="{device}")
+         ... env.set_seed(0)
+         ... env.rollout(2)
+         ... ''')
+         ...     print(batch_size, timer.timeit(10))
+         4
+         env.rollout(100)
+         setup: [...]
+         310.00 ms
+         1 measurement, 10 runs , 1 thread
+
+         16
+         env.rollout(100)
+         setup: [...]
+         268.46 ms
+         1 measurement, 10 runs , 1 thread
+
+         128
+         env.rollout(100)
+         setup: [...]
+         433.80 ms
+         1 measurement, 10 runs , 1 thread
+
+     One can backpropagate through the rollout and optimize the policy directly:
+
+         >>> from torchrl.envs import BraxEnv
+         >>> from tensordict.nn import TensorDictModule
+         >>> from torch import nn
+         >>> import torch
+         >>>
+         >>> env = BraxEnv("ant", batch_size=[10], requires_grad=True, cache_clear_frequency=100)
+         >>> env.set_seed(0)
+         >>> torch.manual_seed(0)
+         >>> policy = TensorDictModule(nn.Linear(27, 8), in_keys=["observation"], out_keys=["action"])
+         >>>
+         >>> td = env.rollout(10, policy)
+         >>>
+         >>> td["next", "reward"].mean().backward(retain_graph=True)
+         >>> print(policy.module.weight.grad.norm())
+         tensor(213.8605)
+
+     """
+
+     def __init__(self, env_name, **kwargs):
+         kwargs["env_name"] = env_name
+         super().__init__(**kwargs)
+
+     def _build_env(
+         self,
+         env_name: str,
+         **kwargs,
+     ) -> brax.envs.env.Env:  # noqa: F821
+         if not _has_brax:
+             raise ImportError(
+                 f"brax not found, unable to create {env_name}. "
+                 f"Consider downloading and installing brax from"
+                 f" {self.git_url}"
+             )
+         from_pixels = kwargs.pop("from_pixels", False)
+         pixels_only = kwargs.pop("pixels_only", True)
+         requires_grad = kwargs.pop("requires_grad", False)
+         cache_clear_frequency = kwargs.pop("cache_clear_frequency", False)
+         if kwargs:
+             raise ValueError("kwargs not supported.")
+         self.wrapper_frame_skip = 1
+         env = self.lib.envs.get_environment(env_name, **kwargs)
+         return super()._build_env(
+             env,
+             pixels_only=pixels_only,
+             from_pixels=from_pixels,
+             requires_grad=requires_grad,
+             cache_clear_frequency=cache_clear_frequency,
+         )
+
+     @property
+     def env_name(self):
+         return self._constructor_kwargs["env_name"]
+
+     def _check_kwargs(self, kwargs: dict):
+         if "env_name" not in kwargs:
+             raise TypeError("Expected 'env_name' to be part of kwargs")
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(env={self.env_name}, batch_size={self.batch_size}, device={self.device})"
+
+
+ class _BraxEnvStep(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, env: BraxWrapper, state_td, action_tensor, *qp_values):
+         import jax
+
+         # convert tensors to ndarrays
+         state_obj = _tensordict_to_object(state_td, env._state_example)
+         action_nd = _tensor_to_ndarray(action_tensor)
+
+         # flatten batch size
+         state = _tree_flatten(state_obj, env.batch_size)
+         action = _tree_flatten(action_nd, env.batch_size)
+
+         # call vjp with jit and vmap
+         next_state, vjp_fn = jax.vjp(env._vmap_jit_env_step, state, action)
+
+         # reshape batch size
+         next_state_reshape = _tree_reshape(next_state, env.batch_size)
+
+         # convert ndarrays to tensors
+         next_state_tensor = _object_to_tensordict(
+             next_state_reshape, device=env.device, batch_size=env.batch_size
+         )
+
+         # save context
+         ctx.vjp_fn = vjp_fn
+         ctx.next_state = next_state_tensor
+         ctx.env = env
+         # Mark that backward hasn't been called yet
+         ctx._backward_called = False
+
+         return (
+             next_state_tensor,  # no gradient
+             next_state_tensor["obs"],
+             next_state_tensor["reward"],
+             *next_state_tensor["pipeline_state"].values(),
+         )
+
+     @staticmethod
+     def backward(ctx, _, grad_next_obs, grad_next_reward, *grad_next_qp_values):
+         # Prevent multiple backward calls on the same context
+         if hasattr(ctx, "_backward_called") and ctx._backward_called:
+             # one None per forward input: env, state_td, action_tensor and qp_values
+             return (None, None, None, *([None] * len(grad_next_qp_values)))
+
+         ctx._backward_called = True
+
+         pipeline_state = dict(
+             zip(ctx.next_state.get("pipeline_state").keys(), grad_next_qp_values)
+         )
+         none_keys = []
+
+         def _make_none(key, val):
+             if val is not None:
+                 return val
+             none_keys.append(key)
+             return torch.zeros_like(ctx.next_state.get(("pipeline_state", key)))
+
+         pipeline_state = {
+             key: _make_none(key, val) for key, val in pipeline_state.items()
+         }
+         metrics = ctx.next_state.get("metrics", None)
+         if metrics is None:
+             metrics = {}
+         info = ctx.next_state.get("info", None)
+         if info is None:
+             info = {}
+         grad_next_state_td = TensorDict(
+             source={
+                 "pipeline_state": pipeline_state,
+                 "obs": grad_next_obs,
+                 "reward": grad_next_reward,
+                 "done": torch.zeros_like(ctx.next_state.get("done")),
+                 "metrics": {k: torch.zeros_like(v) for k, v in metrics.items()},
+                 "info": {k: torch.zeros_like(v) for k, v in info.items()},
+             },
+             device=ctx.env.device,
+             batch_size=ctx.env.batch_size,
+         )
+         # convert tensors to ndarrays
+         grad_next_state_obj = _tensordict_to_object(
+             grad_next_state_td, ctx.env._state_example
+         )
+
+         # flatten batch size
+         grad_next_state_flat = _tree_flatten(grad_next_state_obj, ctx.env.batch_size)
+
+         # call vjp to get gradients
+         grad_state, grad_action = ctx.vjp_fn(grad_next_state_flat)
+         # assert grad_action.device == ctx.env.device
+
+         # reshape batch size
+         grad_state = _tree_reshape(grad_state, ctx.env.batch_size)
+         grad_action = _tree_reshape(grad_action, ctx.env.batch_size)
+         # assert grad_action.device == ctx.env.device
+
+         # convert ndarrays to tensors
+         grad_state_qp = _object_to_tensordict(
+             grad_state.pipeline_state,
+             device=ctx.env.device,
+             batch_size=ctx.env.batch_size,
+         )
+         grad_action = _ndarray_to_tensor(grad_action).to(ctx.env.device)
+         grad_state_qp = {
+             key: val if key not in none_keys else None
+             for key, val in grad_state_qp.items()
+         }
+         grads = (grad_action, *grad_state_qp.values())
+
+         # Clean up context to prevent memory leaks
+         try:
+             # Clear JAX VJP function reference
+             del ctx.vjp_fn
+         except AttributeError:
+             pass
+         try:
+             # Clear stored tensors
+             del ctx.next_state
+         except AttributeError:
+             pass
+         try:
+             # Clear environment reference
+             del ctx.env
+         except AttributeError:
+             pass
+         try:
+             # Clear the backward flag
+             del ctx._backward_called
+         except AttributeError:
+             pass
+
+         return (None, None, *grads)
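
For readers unfamiliar with the `_BraxEnvStep` pattern above, here is a minimal, self-contained sketch (not part of the torchrl source; `toy_step`, `_to_jax` and `_to_torch` are illustrative names) of how a `torch.autograd.Function` can bridge JAX and PyTorch autodiff: the forward pass runs the JAX computation through `jax.vjp` and stores the returned pullback, and the backward pass replays that pullback on the incoming cotangents. torchrl's actual implementation additionally converts whole brax `State` pytrees with the dlpack-based helpers in `torchrl.envs.libs.jax_utils` and runs the step under `vmap`/`jit`.

```python
import jax
import jax.numpy as jnp
import numpy as np
import torch


def _to_jax(t: torch.Tensor):
    # torch -> jax via a host copy (torchrl uses zero-copy dlpack helpers instead)
    return jnp.asarray(t.detach().cpu().numpy())


def _to_torch(a) -> torch.Tensor:
    # jax -> torch via a host copy
    return torch.from_numpy(np.array(a))


def toy_step(x):
    # stand-in for a differentiable brax step: y = x * sin(x)
    return x * jnp.sin(x)


class JaxStep(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
        # run the JAX computation and keep the pullback for backward
        y, vjp_fn = jax.vjp(toy_step, _to_jax(x))
        ctx.vjp_fn = vjp_fn
        return _to_torch(y)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor) -> torch.Tensor:
        # replay the stored pullback on the incoming cotangent
        (grad_x,) = ctx.vjp_fn(_to_jax(grad_y))
        del ctx.vjp_fn  # drop the JAX reference, as _BraxEnvStep does
        return _to_torch(grad_x)


x = torch.randn(4, requires_grad=True)
JaxStep.apply(x).sum().backward()
# analytical gradient of x*sin(x) is sin(x) + x*cos(x)
print(torch.allclose(x.grad, torch.sin(x) + x * torch.cos(x), atol=1e-5))
```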