torchrl 0.11.0__cp314-cp314-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (394)
  1. benchmarks/benchmark_batched_envs.py +104 -0
  2. benchmarks/conftest.py +91 -0
  3. benchmarks/ecosystem/gym_env_throughput.py +321 -0
  4. benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
  5. benchmarks/requirements.txt +7 -0
  6. benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
  7. benchmarks/test_collectors_benchmark.py +240 -0
  8. benchmarks/test_compressed_storage_benchmark.py +145 -0
  9. benchmarks/test_envs_benchmark.py +133 -0
  10. benchmarks/test_llm.py +101 -0
  11. benchmarks/test_non_tensor_env_benchmark.py +70 -0
  12. benchmarks/test_objectives_benchmarks.py +1199 -0
  13. benchmarks/test_replaybuffer_benchmark.py +254 -0
  14. sota-check/README.md +35 -0
  15. sota-implementations/README.md +142 -0
  16. sota-implementations/a2c/README.md +39 -0
  17. sota-implementations/a2c/a2c_atari.py +291 -0
  18. sota-implementations/a2c/a2c_mujoco.py +273 -0
  19. sota-implementations/a2c/utils_atari.py +240 -0
  20. sota-implementations/a2c/utils_mujoco.py +160 -0
  21. sota-implementations/bandits/README.md +7 -0
  22. sota-implementations/bandits/dqn.py +126 -0
  23. sota-implementations/cql/cql_offline.py +198 -0
  24. sota-implementations/cql/cql_online.py +249 -0
  25. sota-implementations/cql/discrete_cql_offline.py +180 -0
  26. sota-implementations/cql/discrete_cql_online.py +227 -0
  27. sota-implementations/cql/utils.py +471 -0
  28. sota-implementations/crossq/crossq.py +271 -0
  29. sota-implementations/crossq/utils.py +320 -0
  30. sota-implementations/ddpg/ddpg.py +231 -0
  31. sota-implementations/ddpg/utils.py +325 -0
  32. sota-implementations/decision_transformer/dt.py +163 -0
  33. sota-implementations/decision_transformer/lamb.py +167 -0
  34. sota-implementations/decision_transformer/online_dt.py +178 -0
  35. sota-implementations/decision_transformer/utils.py +562 -0
  36. sota-implementations/discrete_sac/discrete_sac.py +243 -0
  37. sota-implementations/discrete_sac/utils.py +324 -0
  38. sota-implementations/dqn/README.md +30 -0
  39. sota-implementations/dqn/dqn_atari.py +272 -0
  40. sota-implementations/dqn/dqn_cartpole.py +236 -0
  41. sota-implementations/dqn/utils_atari.py +132 -0
  42. sota-implementations/dqn/utils_cartpole.py +90 -0
  43. sota-implementations/dreamer/README.md +129 -0
  44. sota-implementations/dreamer/dreamer.py +586 -0
  45. sota-implementations/dreamer/dreamer_utils.py +1107 -0
  46. sota-implementations/expert-iteration/README.md +352 -0
  47. sota-implementations/expert-iteration/ei_utils.py +770 -0
  48. sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
  49. sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
  50. sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
  51. sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
  52. sota-implementations/gail/gail.py +327 -0
  53. sota-implementations/gail/gail_utils.py +68 -0
  54. sota-implementations/gail/ppo_utils.py +157 -0
  55. sota-implementations/grpo/README.md +273 -0
  56. sota-implementations/grpo/grpo-async.py +437 -0
  57. sota-implementations/grpo/grpo-sync.py +435 -0
  58. sota-implementations/grpo/grpo_utils.py +843 -0
  59. sota-implementations/grpo/requirements_gsm8k.txt +11 -0
  60. sota-implementations/grpo/requirements_ifeval.txt +16 -0
  61. sota-implementations/impala/README.md +33 -0
  62. sota-implementations/impala/impala_multi_node_ray.py +292 -0
  63. sota-implementations/impala/impala_multi_node_submitit.py +284 -0
  64. sota-implementations/impala/impala_single_node.py +261 -0
  65. sota-implementations/impala/utils.py +184 -0
  66. sota-implementations/iql/discrete_iql.py +230 -0
  67. sota-implementations/iql/iql_offline.py +164 -0
  68. sota-implementations/iql/iql_online.py +225 -0
  69. sota-implementations/iql/utils.py +437 -0
  70. sota-implementations/multiagent/README.md +74 -0
  71. sota-implementations/multiagent/iql.py +237 -0
  72. sota-implementations/multiagent/maddpg_iddpg.py +266 -0
  73. sota-implementations/multiagent/mappo_ippo.py +267 -0
  74. sota-implementations/multiagent/qmix_vdn.py +271 -0
  75. sota-implementations/multiagent/sac.py +337 -0
  76. sota-implementations/multiagent/utils/__init__.py +4 -0
  77. sota-implementations/multiagent/utils/logging.py +151 -0
  78. sota-implementations/multiagent/utils/utils.py +43 -0
  79. sota-implementations/ppo/README.md +29 -0
  80. sota-implementations/ppo/ppo_atari.py +305 -0
  81. sota-implementations/ppo/ppo_mujoco.py +293 -0
  82. sota-implementations/ppo/utils_atari.py +238 -0
  83. sota-implementations/ppo/utils_mujoco.py +152 -0
  84. sota-implementations/ppo_trainer/train.py +21 -0
  85. sota-implementations/redq/README.md +7 -0
  86. sota-implementations/redq/redq.py +199 -0
  87. sota-implementations/redq/utils.py +1060 -0
  88. sota-implementations/sac/sac-async.py +266 -0
  89. sota-implementations/sac/sac.py +239 -0
  90. sota-implementations/sac/utils.py +381 -0
  91. sota-implementations/sac_trainer/train.py +16 -0
  92. sota-implementations/td3/td3.py +254 -0
  93. sota-implementations/td3/utils.py +319 -0
  94. sota-implementations/td3_bc/td3_bc.py +177 -0
  95. sota-implementations/td3_bc/utils.py +251 -0
  96. torchrl/__init__.py +144 -0
  97. torchrl/_extension.py +74 -0
  98. torchrl/_torchrl.cpython-314-aarch64-linux-gnu.so +0 -0
  99. torchrl/_utils.py +1431 -0
  100. torchrl/collectors/__init__.py +48 -0
  101. torchrl/collectors/_base.py +1058 -0
  102. torchrl/collectors/_constants.py +88 -0
  103. torchrl/collectors/_multi_async.py +324 -0
  104. torchrl/collectors/_multi_base.py +1805 -0
  105. torchrl/collectors/_multi_sync.py +464 -0
  106. torchrl/collectors/_runner.py +581 -0
  107. torchrl/collectors/_single.py +2009 -0
  108. torchrl/collectors/_single_async.py +259 -0
  109. torchrl/collectors/collectors.py +62 -0
  110. torchrl/collectors/distributed/__init__.py +32 -0
  111. torchrl/collectors/distributed/default_configs.py +133 -0
  112. torchrl/collectors/distributed/generic.py +1306 -0
  113. torchrl/collectors/distributed/ray.py +1092 -0
  114. torchrl/collectors/distributed/rpc.py +1006 -0
  115. torchrl/collectors/distributed/sync.py +731 -0
  116. torchrl/collectors/distributed/utils.py +160 -0
  117. torchrl/collectors/llm/__init__.py +10 -0
  118. torchrl/collectors/llm/base.py +494 -0
  119. torchrl/collectors/llm/ray_collector.py +275 -0
  120. torchrl/collectors/llm/utils.py +36 -0
  121. torchrl/collectors/llm/weight_update/__init__.py +10 -0
  122. torchrl/collectors/llm/weight_update/vllm.py +348 -0
  123. torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
  124. torchrl/collectors/utils.py +433 -0
  125. torchrl/collectors/weight_update.py +591 -0
  126. torchrl/csrc/numpy_utils.h +38 -0
  127. torchrl/csrc/pybind.cpp +27 -0
  128. torchrl/csrc/segment_tree.h +458 -0
  129. torchrl/csrc/torch_utils.h +34 -0
  130. torchrl/csrc/utils.cpp +48 -0
  131. torchrl/csrc/utils.h +31 -0
  132. torchrl/data/__init__.py +187 -0
  133. torchrl/data/datasets/__init__.py +58 -0
  134. torchrl/data/datasets/atari_dqn.py +878 -0
  135. torchrl/data/datasets/common.py +281 -0
  136. torchrl/data/datasets/d4rl.py +489 -0
  137. torchrl/data/datasets/d4rl_infos.py +187 -0
  138. torchrl/data/datasets/gen_dgrl.py +375 -0
  139. torchrl/data/datasets/minari_data.py +643 -0
  140. torchrl/data/datasets/openml.py +177 -0
  141. torchrl/data/datasets/openx.py +798 -0
  142. torchrl/data/datasets/roboset.py +363 -0
  143. torchrl/data/datasets/utils.py +11 -0
  144. torchrl/data/datasets/vd4rl.py +432 -0
  145. torchrl/data/llm/__init__.py +34 -0
  146. torchrl/data/llm/dataset.py +491 -0
  147. torchrl/data/llm/history.py +1378 -0
  148. torchrl/data/llm/prompt.py +198 -0
  149. torchrl/data/llm/reward.py +225 -0
  150. torchrl/data/llm/topk.py +186 -0
  151. torchrl/data/llm/utils.py +543 -0
  152. torchrl/data/map/__init__.py +21 -0
  153. torchrl/data/map/hash.py +185 -0
  154. torchrl/data/map/query.py +204 -0
  155. torchrl/data/map/tdstorage.py +363 -0
  156. torchrl/data/map/tree.py +1434 -0
  157. torchrl/data/map/utils.py +103 -0
  158. torchrl/data/postprocs/__init__.py +8 -0
  159. torchrl/data/postprocs/postprocs.py +391 -0
  160. torchrl/data/replay_buffers/__init__.py +99 -0
  161. torchrl/data/replay_buffers/checkpointers.py +622 -0
  162. torchrl/data/replay_buffers/ray_buffer.py +292 -0
  163. torchrl/data/replay_buffers/replay_buffers.py +2376 -0
  164. torchrl/data/replay_buffers/samplers.py +2578 -0
  165. torchrl/data/replay_buffers/scheduler.py +265 -0
  166. torchrl/data/replay_buffers/storages.py +2412 -0
  167. torchrl/data/replay_buffers/utils.py +1042 -0
  168. torchrl/data/replay_buffers/writers.py +781 -0
  169. torchrl/data/tensor_specs.py +7101 -0
  170. torchrl/data/utils.py +334 -0
  171. torchrl/envs/__init__.py +265 -0
  172. torchrl/envs/async_envs.py +1105 -0
  173. torchrl/envs/batched_envs.py +3093 -0
  174. torchrl/envs/common.py +4241 -0
  175. torchrl/envs/custom/__init__.py +11 -0
  176. torchrl/envs/custom/chess.py +617 -0
  177. torchrl/envs/custom/llm.py +214 -0
  178. torchrl/envs/custom/pendulum.py +401 -0
  179. torchrl/envs/custom/san_moves.txt +29274 -0
  180. torchrl/envs/custom/tictactoeenv.py +288 -0
  181. torchrl/envs/env_creator.py +263 -0
  182. torchrl/envs/gym_like.py +752 -0
  183. torchrl/envs/libs/__init__.py +68 -0
  184. torchrl/envs/libs/_gym_utils.py +326 -0
  185. torchrl/envs/libs/brax.py +846 -0
  186. torchrl/envs/libs/dm_control.py +544 -0
  187. torchrl/envs/libs/envpool.py +447 -0
  188. torchrl/envs/libs/gym.py +2239 -0
  189. torchrl/envs/libs/habitat.py +138 -0
  190. torchrl/envs/libs/isaac_lab.py +87 -0
  191. torchrl/envs/libs/isaacgym.py +203 -0
  192. torchrl/envs/libs/jax_utils.py +166 -0
  193. torchrl/envs/libs/jumanji.py +963 -0
  194. torchrl/envs/libs/meltingpot.py +599 -0
  195. torchrl/envs/libs/openml.py +153 -0
  196. torchrl/envs/libs/openspiel.py +652 -0
  197. torchrl/envs/libs/pettingzoo.py +1042 -0
  198. torchrl/envs/libs/procgen.py +351 -0
  199. torchrl/envs/libs/robohive.py +429 -0
  200. torchrl/envs/libs/smacv2.py +645 -0
  201. torchrl/envs/libs/unity_mlagents.py +891 -0
  202. torchrl/envs/libs/utils.py +147 -0
  203. torchrl/envs/libs/vmas.py +813 -0
  204. torchrl/envs/llm/__init__.py +63 -0
  205. torchrl/envs/llm/chat.py +730 -0
  206. torchrl/envs/llm/datasets/README.md +4 -0
  207. torchrl/envs/llm/datasets/__init__.py +17 -0
  208. torchrl/envs/llm/datasets/gsm8k.py +353 -0
  209. torchrl/envs/llm/datasets/ifeval.py +274 -0
  210. torchrl/envs/llm/envs.py +789 -0
  211. torchrl/envs/llm/libs/README.md +3 -0
  212. torchrl/envs/llm/libs/__init__.py +8 -0
  213. torchrl/envs/llm/libs/mlgym.py +869 -0
  214. torchrl/envs/llm/reward/__init__.py +10 -0
  215. torchrl/envs/llm/reward/gsm8k.py +324 -0
  216. torchrl/envs/llm/reward/ifeval/README.md +13 -0
  217. torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
  218. torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
  219. torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
  220. torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
  221. torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
  222. torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
  223. torchrl/envs/llm/transforms/__init__.py +55 -0
  224. torchrl/envs/llm/transforms/browser.py +292 -0
  225. torchrl/envs/llm/transforms/dataloading.py +859 -0
  226. torchrl/envs/llm/transforms/format.py +73 -0
  227. torchrl/envs/llm/transforms/kl.py +1544 -0
  228. torchrl/envs/llm/transforms/policy_version.py +189 -0
  229. torchrl/envs/llm/transforms/reason.py +323 -0
  230. torchrl/envs/llm/transforms/tokenizer.py +321 -0
  231. torchrl/envs/llm/transforms/tools.py +1955 -0
  232. torchrl/envs/model_based/__init__.py +9 -0
  233. torchrl/envs/model_based/common.py +180 -0
  234. torchrl/envs/model_based/dreamer.py +112 -0
  235. torchrl/envs/transforms/__init__.py +147 -0
  236. torchrl/envs/transforms/functional.py +48 -0
  237. torchrl/envs/transforms/gym_transforms.py +203 -0
  238. torchrl/envs/transforms/module.py +341 -0
  239. torchrl/envs/transforms/r3m.py +372 -0
  240. torchrl/envs/transforms/ray_service.py +663 -0
  241. torchrl/envs/transforms/rb_transforms.py +214 -0
  242. torchrl/envs/transforms/transforms.py +11835 -0
  243. torchrl/envs/transforms/utils.py +94 -0
  244. torchrl/envs/transforms/vc1.py +307 -0
  245. torchrl/envs/transforms/vecnorm.py +845 -0
  246. torchrl/envs/transforms/vip.py +407 -0
  247. torchrl/envs/utils.py +1718 -0
  248. torchrl/envs/vec_envs.py +11 -0
  249. torchrl/modules/__init__.py +206 -0
  250. torchrl/modules/distributions/__init__.py +73 -0
  251. torchrl/modules/distributions/continuous.py +830 -0
  252. torchrl/modules/distributions/discrete.py +908 -0
  253. torchrl/modules/distributions/truncated_normal.py +187 -0
  254. torchrl/modules/distributions/utils.py +233 -0
  255. torchrl/modules/llm/__init__.py +62 -0
  256. torchrl/modules/llm/backends/__init__.py +65 -0
  257. torchrl/modules/llm/backends/vllm/__init__.py +94 -0
  258. torchrl/modules/llm/backends/vllm/_models.py +46 -0
  259. torchrl/modules/llm/backends/vllm/base.py +72 -0
  260. torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
  261. torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
  262. torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
  263. torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
  264. torchrl/modules/llm/policies/__init__.py +28 -0
  265. torchrl/modules/llm/policies/common.py +1809 -0
  266. torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
  267. torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
  268. torchrl/modules/llm/utils.py +23 -0
  269. torchrl/modules/mcts/__init__.py +21 -0
  270. torchrl/modules/mcts/scores.py +579 -0
  271. torchrl/modules/models/__init__.py +86 -0
  272. torchrl/modules/models/batchrenorm.py +119 -0
  273. torchrl/modules/models/decision_transformer.py +179 -0
  274. torchrl/modules/models/exploration.py +731 -0
  275. torchrl/modules/models/llm.py +156 -0
  276. torchrl/modules/models/model_based.py +596 -0
  277. torchrl/modules/models/models.py +1712 -0
  278. torchrl/modules/models/multiagent.py +1067 -0
  279. torchrl/modules/models/recipes/impala.py +185 -0
  280. torchrl/modules/models/utils.py +162 -0
  281. torchrl/modules/planners/__init__.py +10 -0
  282. torchrl/modules/planners/cem.py +228 -0
  283. torchrl/modules/planners/common.py +73 -0
  284. torchrl/modules/planners/mppi.py +265 -0
  285. torchrl/modules/tensordict_module/__init__.py +89 -0
  286. torchrl/modules/tensordict_module/actors.py +2457 -0
  287. torchrl/modules/tensordict_module/common.py +529 -0
  288. torchrl/modules/tensordict_module/exploration.py +814 -0
  289. torchrl/modules/tensordict_module/probabilistic.py +321 -0
  290. torchrl/modules/tensordict_module/rnn.py +1639 -0
  291. torchrl/modules/tensordict_module/sequence.py +132 -0
  292. torchrl/modules/tensordict_module/world_models.py +34 -0
  293. torchrl/modules/utils/__init__.py +38 -0
  294. torchrl/modules/utils/mappings.py +9 -0
  295. torchrl/modules/utils/utils.py +89 -0
  296. torchrl/objectives/__init__.py +78 -0
  297. torchrl/objectives/a2c.py +659 -0
  298. torchrl/objectives/common.py +753 -0
  299. torchrl/objectives/cql.py +1346 -0
  300. torchrl/objectives/crossq.py +710 -0
  301. torchrl/objectives/ddpg.py +453 -0
  302. torchrl/objectives/decision_transformer.py +371 -0
  303. torchrl/objectives/deprecated.py +516 -0
  304. torchrl/objectives/dqn.py +683 -0
  305. torchrl/objectives/dreamer.py +488 -0
  306. torchrl/objectives/functional.py +48 -0
  307. torchrl/objectives/gail.py +258 -0
  308. torchrl/objectives/iql.py +996 -0
  309. torchrl/objectives/llm/__init__.py +30 -0
  310. torchrl/objectives/llm/grpo.py +846 -0
  311. torchrl/objectives/llm/sft.py +482 -0
  312. torchrl/objectives/multiagent/__init__.py +8 -0
  313. torchrl/objectives/multiagent/qmixer.py +396 -0
  314. torchrl/objectives/ppo.py +1669 -0
  315. torchrl/objectives/redq.py +683 -0
  316. torchrl/objectives/reinforce.py +530 -0
  317. torchrl/objectives/sac.py +1580 -0
  318. torchrl/objectives/td3.py +570 -0
  319. torchrl/objectives/td3_bc.py +625 -0
  320. torchrl/objectives/utils.py +782 -0
  321. torchrl/objectives/value/__init__.py +28 -0
  322. torchrl/objectives/value/advantages.py +1956 -0
  323. torchrl/objectives/value/functional.py +1459 -0
  324. torchrl/objectives/value/utils.py +360 -0
  325. torchrl/record/__init__.py +17 -0
  326. torchrl/record/loggers/__init__.py +23 -0
  327. torchrl/record/loggers/common.py +48 -0
  328. torchrl/record/loggers/csv.py +226 -0
  329. torchrl/record/loggers/mlflow.py +142 -0
  330. torchrl/record/loggers/tensorboard.py +139 -0
  331. torchrl/record/loggers/trackio.py +163 -0
  332. torchrl/record/loggers/utils.py +78 -0
  333. torchrl/record/loggers/wandb.py +214 -0
  334. torchrl/record/recorder.py +554 -0
  335. torchrl/services/__init__.py +79 -0
  336. torchrl/services/base.py +109 -0
  337. torchrl/services/ray_service.py +453 -0
  338. torchrl/testing/__init__.py +107 -0
  339. torchrl/testing/assertions.py +179 -0
  340. torchrl/testing/dist_utils.py +122 -0
  341. torchrl/testing/env_creators.py +227 -0
  342. torchrl/testing/env_helper.py +35 -0
  343. torchrl/testing/gym_helpers.py +156 -0
  344. torchrl/testing/llm_mocks.py +119 -0
  345. torchrl/testing/mocking_classes.py +2720 -0
  346. torchrl/testing/modules.py +295 -0
  347. torchrl/testing/mp_helpers.py +15 -0
  348. torchrl/testing/ray_helpers.py +293 -0
  349. torchrl/testing/utils.py +190 -0
  350. torchrl/trainers/__init__.py +42 -0
  351. torchrl/trainers/algorithms/__init__.py +11 -0
  352. torchrl/trainers/algorithms/configs/__init__.py +705 -0
  353. torchrl/trainers/algorithms/configs/collectors.py +216 -0
  354. torchrl/trainers/algorithms/configs/common.py +41 -0
  355. torchrl/trainers/algorithms/configs/data.py +308 -0
  356. torchrl/trainers/algorithms/configs/envs.py +104 -0
  357. torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
  358. torchrl/trainers/algorithms/configs/logging.py +80 -0
  359. torchrl/trainers/algorithms/configs/modules.py +570 -0
  360. torchrl/trainers/algorithms/configs/objectives.py +177 -0
  361. torchrl/trainers/algorithms/configs/trainers.py +340 -0
  362. torchrl/trainers/algorithms/configs/transforms.py +955 -0
  363. torchrl/trainers/algorithms/configs/utils.py +252 -0
  364. torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
  365. torchrl/trainers/algorithms/configs/weight_update.py +159 -0
  366. torchrl/trainers/algorithms/ppo.py +373 -0
  367. torchrl/trainers/algorithms/sac.py +308 -0
  368. torchrl/trainers/helpers/__init__.py +40 -0
  369. torchrl/trainers/helpers/collectors.py +416 -0
  370. torchrl/trainers/helpers/envs.py +573 -0
  371. torchrl/trainers/helpers/logger.py +33 -0
  372. torchrl/trainers/helpers/losses.py +132 -0
  373. torchrl/trainers/helpers/models.py +658 -0
  374. torchrl/trainers/helpers/replay_buffer.py +59 -0
  375. torchrl/trainers/helpers/trainers.py +301 -0
  376. torchrl/trainers/trainers.py +2052 -0
  377. torchrl/weight_update/__init__.py +33 -0
  378. torchrl/weight_update/_distributed.py +749 -0
  379. torchrl/weight_update/_mp.py +624 -0
  380. torchrl/weight_update/_noupdate.py +102 -0
  381. torchrl/weight_update/_ray.py +1032 -0
  382. torchrl/weight_update/_rpc.py +284 -0
  383. torchrl/weight_update/_shared.py +891 -0
  384. torchrl/weight_update/llm/__init__.py +32 -0
  385. torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
  386. torchrl/weight_update/llm/vllm_nccl.py +710 -0
  387. torchrl/weight_update/utils.py +73 -0
  388. torchrl/weight_update/weight_sync_schemes.py +1244 -0
  389. torchrl-0.11.0.dist-info/METADATA +1308 -0
  390. torchrl-0.11.0.dist-info/RECORD +394 -0
  391. torchrl-0.11.0.dist-info/WHEEL +5 -0
  392. torchrl-0.11.0.dist-info/entry_points.txt +2 -0
  393. torchrl-0.11.0.dist-info/licenses/LICENSE +21 -0
  394. torchrl-0.11.0.dist-info/top_level.txt +7 -0
torchrl/modules/distributions/discrete.py
@@ -0,0 +1,908 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import annotations

from collections.abc import Sequence

from enum import Enum
from functools import wraps

import torch
import torch.distributions as D
import torch.nn.functional as F
from tensordict.utils import expand_as_right

from torch.distributions.utils import lazy_property, logits_to_probs, probs_to_logits

__all__ = [
    "OneHotCategorical",
    "MaskedCategorical",
    "Ordinal",
    "OneHotOrdinal",
    "LLMMaskedCategorical",
]


def _treat_categorical_params(
    params: torch.Tensor | None = None,
) -> torch.Tensor | None:
    if params is None:
        return None
    if params.shape[-1] == 1:
        params = params[..., 0]
    return params


def rand_one_hot(values: torch.Tensor, do_softmax: bool = True) -> torch.Tensor:
    if do_softmax:
        values = values.softmax(-1)
    out = values.cumsum(-1) > torch.rand_like(values[..., :1])
    out = (out.cumsum(-1) == 1).to(torch.long)
    return out


class _one_hot_wrapper:
    def __init__(self, parent_dist):
        self.parent_dist = parent_dist

    def __call__(self, func):
        @wraps(func)
        def wrapped(_self, *args, **kwargs):
            out = getattr(self.parent_dist, func.__name__)(_self, *args, **kwargs)
            n = _self.num_samples
            return torch.nn.functional.one_hot(out, n)

        return wrapped


class ReparamGradientStrategy(Enum):
    PassThrough = 1
    RelaxedOneHot = 2


class OneHotCategorical(D.Categorical):
    """One-hot categorical distribution.

    This class behaves exactly as torch.distributions.Categorical except that it reads and produces one-hot encodings
    of the discrete tensors.

    Args:
        logits (torch.Tensor): event log probabilities (unnormalized)
        probs (torch.Tensor): event probabilities
        grad_method (ReparamGradientStrategy, optional): strategy to gather
            reparameterized samples.
            ``ReparamGradientStrategy.PassThrough`` will compute the sample gradients
            by using the softmax valued log-probability as a proxy to the
            sample gradients.
            ``ReparamGradientStrategy.RelaxedOneHot`` will use
            :class:`torch.distributions.RelaxedOneHotCategorical` to sample from the distribution.

    Examples:
        >>> torch.manual_seed(0)
        >>> logits = torch.randn(4)
        >>> dist = OneHotCategorical(logits=logits)
        >>> print(dist.rsample((3,)))
        tensor([[1., 0., 0., 0.],
                [0., 0., 0., 1.],
                [1., 0., 0., 0.]])

    """

    num_params: int = 1

    # This is to make the compiler happy, see https://github.com/pytorch/pytorch/issues/140266
    @lazy_property
    def logits(self):
        return probs_to_logits(self.probs)

    @lazy_property
    def probs(self):
        return logits_to_probs(self.logits)

    def __init__(
        self,
        logits: torch.Tensor | None = None,
        probs: torch.Tensor | None = None,
        grad_method: ReparamGradientStrategy = ReparamGradientStrategy.PassThrough,
        **kwargs,
    ) -> None:
        logits = _treat_categorical_params(logits)
        probs = _treat_categorical_params(probs)
        self.grad_method = grad_method
        super().__init__(probs=probs, logits=logits, **kwargs)
        # Get num_samples from logits or probs shape
        if logits is not None:
            self.num_samples = logits.shape[-1]
        else:
            self.num_samples = probs.shape[-1]

    def log_prob(self, value: torch.Tensor) -> torch.Tensor:
        return super().log_prob(value.argmax(dim=-1))

    @property
    def mode(self) -> torch.Tensor:
        if hasattr(self, "logits"):
            return (self.logits == self.logits.max(-1, True)[0]).to(torch.long)
        else:
            return (self.probs == self.probs.max(-1, True)[0]).to(torch.long)

    @property
    def deterministic_sample(self):
        return self.mode

    def entropy(self):
        min_real = torch.finfo(self.logits.dtype).min
        logits = torch.clamp(self.logits, min=min_real)
        p_log_p = logits * self.probs
        return -p_log_p.sum(-1)

    @_one_hot_wrapper(D.Categorical)
    def sample(self, sample_shape: torch.Size | Sequence | None = None) -> torch.Tensor:
        ...

    def rsample(self, sample_shape: torch.Size | Sequence = None) -> torch.Tensor:
        if sample_shape is None:
            sample_shape = torch.Size([])
        if hasattr(self, "logits") and self.logits is not None:
            logits = self.logits
            probs = None
        else:
            logits = None
            probs = self.probs
        if self.grad_method == ReparamGradientStrategy.RelaxedOneHot:
            d = D.relaxed_categorical.RelaxedOneHotCategorical(
                1.0, probs=probs, logits=logits
            )
            out = d.rsample(sample_shape)
            out.data.copy_((out == out.max(-1)[0].unsqueeze(-1)).to(out.dtype))
            return out
        elif self.grad_method == ReparamGradientStrategy.PassThrough:
            if logits is not None:
                probs = self.probs
            else:
                probs = torch.softmax(self.logits, dim=-1)
            out = self.sample(sample_shape)
            out = out + probs - probs.detach()
            return out
        else:
            raise ValueError(
                f"Unknown reparameterization strategy {self.grad_method}."
            )


class MaskedCategorical(D.Categorical):
    """MaskedCategorical distribution.

    Reference:
        https://www.tensorflow.org/agents/api_docs/python/tf_agents/distributions/masked/MaskedCategorical

    Args:
        logits (torch.Tensor): event log probabilities (unnormalized)
        probs (torch.Tensor): event probabilities. If provided, the probabilities
            corresponding to masked items will be zeroed and the probability
            re-normalized along its last dimension.

    Keyword Args:
        mask (torch.Tensor): A boolean mask of the same shape as ``logits``/``probs``
            where ``False`` entries are the ones to be masked. Alternatively,
            if ``sparse_mask`` is True, it represents the list of valid indices
            in the distribution. Exclusive with ``indices``.
        indices (torch.Tensor): A dense index tensor representing which actions
            must be taken into account. Exclusive with ``mask``.
        neg_inf (:obj:`float`, optional): The log-probability value allocated to
            invalid (out-of-mask) indices. Defaults to -inf.
        padding_value: The padding value in the mask tensor. When
            sparse_mask == True, the padding_value will be ignored.
        use_cross_entropy (bool, optional): For faster computation of the log-probability,
            the cross_entropy loss functional can be used. Defaults to ``True``.
        padding_side (str, optional): The side of the padding. Defaults to ``"left"``.

    Examples:
        >>> torch.manual_seed(0)
        >>> logits = torch.randn(4) / 100  # almost equal probabilities
        >>> mask = torch.tensor([True, False, True, True])
        >>> dist = MaskedCategorical(logits=logits, mask=mask)
        >>> sample = dist.sample((10,))
        >>> print(sample)  # no `1` in the sample
        tensor([2, 3, 0, 2, 2, 0, 2, 0, 2, 2])
        >>> print(dist.log_prob(sample))
        tensor([-1.1203, -1.0928, -1.0831, -1.1203, -1.1203, -1.0831, -1.1203, -1.0831,
                -1.1203, -1.1203])
        >>> print(dist.log_prob(torch.ones_like(sample)))
        tensor([-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf])
        >>> # with probabilities
        >>> prob = torch.ones(10)
        >>> prob = prob / prob.sum()
        >>> mask = torch.tensor([False] + 9 * [True])  # first outcome is masked
        >>> dist = MaskedCategorical(probs=prob, mask=mask)
        >>> print(dist.log_prob(torch.arange(10)))
        tensor([   -inf, -2.1972, -2.1972, -2.1972, -2.1972, -2.1972, -2.1972, -2.1972,
                -2.1972, -2.1972])
    """

    @lazy_property
    def logits(self):
        return probs_to_logits(self.probs)

    @lazy_property
    def probs(self):
        return logits_to_probs(self.logits)

    def __init__(
        self,
        logits: torch.Tensor | None = None,
        probs: torch.Tensor | None = None,
        *,
        mask: torch.Tensor | None = None,
        indices: torch.Tensor | None = None,
        neg_inf: float = float("-inf"),
        padding_value: int | None = None,
        use_cross_entropy: bool = True,
        padding_side: str = "left",
    ) -> None:
        if not ((mask is None) ^ (indices is None)):
            raise ValueError(
                f"A ``mask`` or some ``indices`` must be provided for {type(self)}, but not both."
            )
        if mask is None:
            mask = indices
            sparse_mask = True
        else:
            sparse_mask = False

        if probs is not None:
            if logits is not None:
                raise ValueError(
                    "Either `probs` or `logits` must be specified, but not both."
                )
            # unnormalized logits
            probs = probs.clone()
            if mask.dtype == torch.bool:
                probs[~mask] = 0
            else:
                probs = torch.scatter(
                    torch.zeros_like(probs), -1, indices, probs.gather(-1, indices)
                )
            probs = probs / probs.sum(-1, keepdim=True)
            logits = probs.log()
        num_samples = logits.shape[-1]
        self.use_cross_entropy = use_cross_entropy
        logits = self._mask_logits(
            logits,
            mask,
            neg_inf=neg_inf,
            sparse_mask=sparse_mask,
            padding_value=padding_value,
        )
        self.neg_inf = neg_inf
        self._mask = mask
        self._sparse_mask = sparse_mask
        self._padding_value = padding_value
        self._padding_side = padding_side
        super().__init__(logits=logits)
        self.num_samples = num_samples

    @property
    def padding_value(self):
        """Padding value of the distribution mask.

        If the padding value is not set, it will be inferred from the logits.
        """
        return self._padding_value if self._padding_value is not None else 0

    @property
    def padding_side(self):
        return self._padding_side

    @property
    def mask(self):
        if self._sparse_mask:
            raise ValueError("MaskedCategorical.mask does not support sparse masks")
        return self._mask

    def entropy(self):
        """Compute the entropy of the distribution.

        For masked distributions, we only consider the entropy over the valid (unmasked) outcomes.
        Invalid outcomes have zero probability and don't contribute to entropy.
        """
        min_real = torch.finfo(self.logits.dtype).min

        # Clamp logits to avoid numerical issues
        logits = self.logits
        if self._mask.dtype is torch.bool:
            mask = expand_as_right(self._mask, logits)
            mask = (~mask) | (~logits.isfinite())
            logits = torch.masked_fill(logits, mask, min_real)
        else:
            # logits are already masked
            pass
        logits = logits - logits.logsumexp(-1, keepdim=True)

        # Get probabilities and mask them
        probs = logits.exp()

        # Compute entropy only for valid outcomes
        p_log_p = logits * probs
        return -p_log_p.sum(-1)

    def sample(
        self, sample_shape: torch.Size | Sequence[int] | None = None
    ) -> torch.Tensor:
        if sample_shape is None:
            sample_shape = torch.Size()
        else:
            sample_shape = torch.Size(sample_shape)

        ret = super().sample(sample_shape)
        if not self._sparse_mask:
            return ret

        size = ret.size()
        outer_dim = sample_shape.numel()
        inner_dim = self._mask.shape[:-1].numel()
        idx_3d = self._mask.expand(outer_dim, inner_dim, -1)
        ret = idx_3d.gather(dim=-1, index=ret.view(outer_dim, inner_dim, 1))
        return ret.reshape(size)

    def log_prob(self, value: torch.Tensor) -> torch.Tensor:
        if not self._sparse_mask:
            if self.use_cross_entropy:
                logits = self.logits
                if logits.ndim > 2:
                    # Bring channels in 2nd dim
                    logits = logits.permute(0, -1, *range(1, logits.ndim - 1))
                original_value_shape = None
                if logits.ndim == 1 and value.ndim >= 1:
                    if value.ndim >= 2:
                        original_value_shape = value.shape
                        value = value.flatten()
                    logits = logits.unsqueeze(0).expand(value.shape + logits.shape)
                result = -torch.nn.functional.cross_entropy(logits, value, reduce=False)
                if original_value_shape is not None:
                    result = result.unflatten(0, original_value_shape)
            else:
                result = super().log_prob(value)
            result = torch.where(torch.isfinite(result), result, self.neg_inf)
            return result

        idx_3d = self._mask.view(1, -1, self._num_events)
        val_3d = value.view(-1, idx_3d.size(1), 1)
        mask = idx_3d == val_3d
        idx = mask.int().argmax(dim=-1, keepdim=True)
        idx = idx.view_as(value)
        if self.use_cross_entropy:
            logits = self.logits
            if logits.ndim > 2:
                # Bring channels in 2nd dim
                logits = logits.transpose(-1, 1)
            # possible shapes:
            # Don't work with cross_entropy (missing batch dimension)
            #   logits.shape = (C,) and idx.shape = (B,)
            #   logits.shape = (C,) and idx.shape = (B0, B1, ...) => requires flattening of idx, only one batch dimension
            # work with cross_entropy:
            #   logits.shape = (B, C) and idx.shape = (B,)
            #   logits.shape = (B, C, d1, d2, ...) and idx.shape = (B, d1, d2, ...)
            original_idx_shape = None
            if logits.ndim == 1 and idx.ndim >= 1:
                if idx.ndim >= 2:
                    original_idx_shape = idx.shape
                    idx = idx.flatten()
                logits = logits.unsqueeze(0).expand(idx.shape + logits.shape)
            ret = -torch.nn.functional.cross_entropy(logits, idx, reduce=False)
            if original_idx_shape is not None:
                ret = ret.unflatten(0, original_idx_shape)
        else:
            ret = super().log_prob(idx)
        # Fill masked values with neg_inf.
        ret = ret.view_as(val_3d)
        ret = ret.masked_fill(
            torch.logical_not(mask.any(dim=-1, keepdim=True)), self.neg_inf
        )
        return ret.view_as(value)

    @staticmethod
    def _mask_logits(
        logits: torch.Tensor,
        mask: torch.Tensor | None = None,
        neg_inf: float = float("-inf"),
        sparse_mask: bool = False,
        padding_value: int | None = None,
    ) -> torch.Tensor:
        if mask is None:
            return logits

        if not sparse_mask:
            return logits.masked_fill(~mask, neg_inf)

        if padding_value is not None:
            padding_mask = mask == padding_value
            if padding_value != 0:
                # Avoid invalid indices in mask.
                mask = mask.masked_fill(padding_mask, 0)
        logits = logits.gather(dim=-1, index=mask)
        if padding_value is not None:
            logits.masked_fill_(padding_mask, neg_inf)
        return logits

    @property
    def deterministic_sample(self):
        return self.mode


class MaskedOneHotCategorical(MaskedCategorical):
    """MaskedOneHotCategorical distribution.

    Reference:
        https://www.tensorflow.org/agents/api_docs/python/tf_agents/distributions/masked/MaskedCategorical

    Args:
        logits (torch.Tensor): event log probabilities (unnormalized)
        probs (torch.Tensor): event probabilities. If provided, the probabilities
            corresponding to masked items will be zeroed and the probability
            re-normalized along its last dimension.

    Keyword Args:
        mask (torch.Tensor): A boolean mask of the same shape as ``logits``/``probs``
            where ``False`` entries are the ones to be masked. Alternatively,
            if ``sparse_mask`` is True, it represents the list of valid indices
            in the distribution. Exclusive with ``indices``.
        indices (torch.Tensor): A dense index tensor representing which actions
            must be taken into account. Exclusive with ``mask``.
        neg_inf (:obj:`float`, optional): The log-probability value allocated to
            invalid (out-of-mask) indices. Defaults to -inf.
        padding_value: The padding value in the mask tensor. When
            sparse_mask == True, the padding_value will be ignored.
        grad_method (ReparamGradientStrategy, optional): strategy to gather
            reparameterized samples.
            ``ReparamGradientStrategy.PassThrough`` will compute the sample gradients
            by using the softmax valued log-probability as a proxy to the
            sample gradients.
            ``ReparamGradientStrategy.RelaxedOneHot`` will use
            :class:`torch.distributions.RelaxedOneHotCategorical` to sample from the distribution.

    Examples:
        >>> torch.manual_seed(0)
        >>> logits = torch.randn(4) / 100  # almost equal probabilities
        >>> mask = torch.tensor([True, False, True, True])
        >>> dist = MaskedOneHotCategorical(logits=logits, mask=mask)
        >>> sample = dist.sample((10,))
        >>> print(sample)  # index 1 is never drawn
        tensor([[0, 0, 1, 0],
                [0, 0, 0, 1],
                [1, 0, 0, 0],
                [0, 0, 1, 0],
                [0, 0, 1, 0],
                [1, 0, 0, 0],
                [0, 0, 1, 0],
                [1, 0, 0, 0],
                [0, 0, 1, 0],
                [0, 0, 1, 0]])
        >>> print(dist.log_prob(sample))
        tensor([-1.1203, -1.0928, -1.0831, -1.1203, -1.1203, -1.0831, -1.1203, -1.0831,
                -1.1203, -1.1203])
        >>> sample_non_valid = torch.zeros_like(sample)
        >>> sample_non_valid[..., 1] = 1
        >>> print(dist.log_prob(sample_non_valid))
        tensor([-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf])
        >>> # with probabilities
        >>> prob = torch.ones(10)
        >>> prob = prob / prob.sum()
        >>> mask = torch.tensor([False] + 9 * [True])  # first outcome is masked
        >>> dist = MaskedOneHotCategorical(probs=prob, mask=mask)
        >>> s = torch.arange(10)
        >>> s = torch.nn.functional.one_hot(s, 10)
        >>> print(dist.log_prob(s))
        tensor([   -inf, -2.1972, -2.1972, -2.1972, -2.1972, -2.1972, -2.1972, -2.1972,
                -2.1972, -2.1972])
    """

    @lazy_property
    def logits(self):
        return probs_to_logits(self.probs)

    @lazy_property
    def probs(self):
        return logits_to_probs(self.logits)

    def __init__(
        self,
        logits: torch.Tensor | None = None,
        probs: torch.Tensor | None = None,
        mask: torch.Tensor = None,
        indices: torch.Tensor = None,
        neg_inf: float = float("-inf"),
        padding_value: int | None = None,
        grad_method: ReparamGradientStrategy = ReparamGradientStrategy.PassThrough,
    ) -> None:
        self.grad_method = grad_method
        super().__init__(
            logits=logits,
            probs=probs,
            mask=mask,
            indices=indices,
            neg_inf=neg_inf,
            padding_value=padding_value,
        )

    @_one_hot_wrapper(MaskedCategorical)
    def sample(
        self, sample_shape: torch.Size | Sequence[int] | None = None
    ) -> torch.Tensor:
        ...

    @property
    def deterministic_sample(self):
        return self.mode

    @property
    def mode(self) -> torch.Tensor:
        if hasattr(self, "logits"):
            return (self.logits == self.logits.max(-1, True)[0]).to(torch.long)
        else:
            return (self.probs == self.probs.max(-1, True)[0]).to(torch.long)

    def log_prob(self, value: torch.Tensor) -> torch.Tensor:
        return super().log_prob(value.argmax(dim=-1))

    def rsample(self, sample_shape: torch.Size | Sequence = None) -> torch.Tensor:
        if sample_shape is None:
            sample_shape = torch.Size([])
        if hasattr(self, "logits") and self.logits is not None:
            logits = self.logits
            probs = None
        else:
            logits = None
            probs = self.probs
        if self.grad_method == ReparamGradientStrategy.RelaxedOneHot:
            if self._sparse_mask:
                if probs is not None:
                    probs_extended = torch.full(
                        (*probs.shape[:-1], self.num_samples),
                        0,
                        device=probs.device,
                        dtype=probs.dtype,
                    )
                    probs_extended = torch.scatter(
                        probs_extended, -1, self._mask, probs
                    )
                    logits_extended = None
                else:
                    probs_extended = torch.full(
                        (*logits.shape[:-1], self.num_samples),
                        self.neg_inf,
                        device=logits.device,
                        dtype=logits.dtype,
                    )
                    logits_extended = torch.scatter(
                        probs_extended, -1, self._mask, logits
                    )
                    probs_extended = None
            else:
                probs_extended = probs
                logits_extended = logits

            d = D.relaxed_categorical.RelaxedOneHotCategorical(
                1.0, probs=probs_extended, logits=logits_extended
            )
            out = d.rsample(sample_shape)
            out.data.copy_((out == out.max(-1)[0].unsqueeze(-1)).to(out.dtype))
            return out
        elif self.grad_method == ReparamGradientStrategy.PassThrough:
            if logits is not None:
                probs = self.probs
            else:
                probs = torch.softmax(self.logits, dim=-1)
            if self._sparse_mask:
                probs_extended = torch.full(
                    (*probs.shape[:-1], self.num_samples),
                    0,
                    device=probs.device,
                    dtype=probs.dtype,
                )
                probs_extended = torch.scatter(probs_extended, -1, self._mask, probs)
            else:
                probs_extended = probs

            out = self.sample(sample_shape)
            out = out + probs_extended - probs_extended.detach()
            return out
        else:
            raise ValueError(
                f"Unknown reparameterization strategy {self.grad_method}."
            )


class Ordinal(D.Categorical):
    """A discrete distribution for learning to sample from finite ordered sets.

    It is defined in contrast with the `Categorical` distribution, which does
    not impose any notion of proximity or ordering over its support's atoms.
    The `Ordinal` distribution explicitly encodes those concepts, which is
    useful for learning discrete sampling from continuous sets. See §5 of
    `Tang & Agrawal, 2020 <https://arxiv.org/pdf/1901.10500.pdf>`_ for details.

    .. note::
        This class is mostly useful when you want to learn a distribution over
        a finite set which is obtained by discretising a continuous set.

    Args:
        scores (torch.Tensor): a tensor of shape [..., N] where N is the size of the set which supports the distributions.
            Typically, the output of a neural network parametrising the distribution.

    Examples:
        >>> num_atoms, num_samples = 5, 20
        >>> mean = (num_atoms - 1) / 2  # Target mean for samples, centered around the middle atom
        >>> torch.manual_seed(42)
        >>> logits = torch.ones((num_atoms), requires_grad=True)
        >>> optimizer = torch.optim.Adam([logits], lr=0.1)
        >>>
        >>> # Perform optimisation loop to minimise deviation from `mean`
        >>> for _ in range(20):
        >>>     sampler = Ordinal(scores=logits)
        >>>     samples = sampler.sample((num_samples,))
        >>>     # Define loss to encourage samples around the mean by penalising deviation from mean
        >>>     loss = torch.mean((samples - mean) ** 2 * sampler.log_prob(samples))
        >>>     loss.backward()
        >>>     optimizer.step()
        >>>     optimizer.zero_grad()
        >>>
        >>> sampler.probs
        tensor([0.0308, 0.1586, 0.4727, 0.2260, 0.1120], ...)
        >>> # Print histogram to observe sample distribution frequency across 5 bins (0, 1, 2, 3, and 4)
        >>> torch.histogram(sampler.sample((1000,)).reshape(-1).float(), bins=num_atoms)
        torch.return_types.histogram(
            hist=tensor([ 24., 158., 478., 228., 112.]),
            bin_edges=tensor([0.0000, 0.8000, 1.6000, 2.4000, 3.2000, 4.0000]))
    """

    def __init__(self, scores: torch.Tensor):
        logits = _generate_ordinal_logits(scores)
        super().__init__(logits=logits)


class OneHotOrdinal(OneHotCategorical):
    """The one-hot version of the :class:`~tensordict.nn.distributions.Ordinal` distribution.

    Args:
        scores (torch.Tensor): a tensor of shape [..., N] where N is the size of the set which supports the distributions.
            Typically, the output of a neural network parametrising the distribution.
    """

    def __init__(self, scores: torch.Tensor):
        logits = _generate_ordinal_logits(scores)
        super().__init__(logits=logits)


def _generate_ordinal_logits(scores: torch.Tensor) -> torch.Tensor:
    """Implements Eq. 4 of `Tang & Agrawal, 2020 <https://arxiv.org/pdf/1901.10500.pdf>`__."""
    # Assigns Bernoulli-like probabilities for each class in the set
    log_probs = F.logsigmoid(scores)
    complementary_log_probs = F.logsigmoid(-scores)

    # Total log-probability for being "larger than k"
    larger_than_log_probs = log_probs.cumsum(dim=-1)

    # Total log-probability for being "smaller than k"
    smaller_than_log_probs = (
        complementary_log_probs.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
        - complementary_log_probs
    )

    return larger_than_log_probs + smaller_than_log_probs


class LLMMaskedCategorical(D.Distribution):
    """LLM-optimized masked categorical distribution.

    This class provides a more memory-efficient approach for LLM training by:
    1. Using ignore_index=-100 for log_prob computation (no masking overhead)
    2. Using traditional masking for sampling operations

    This is particularly beneficial for large vocabulary sizes where masking
    all logits can be memory-intensive.

    Args:
        logits (torch.Tensor): Event log probabilities (unnormalized), shape [B, T, C].
            - *B*: batch size (optional)
            - T: sequence length
            - C: vocabulary size (number of classes)
        mask (torch.Tensor): Boolean mask indicating valid positions/tokens.
            - If shape [*B, T]: position-level masking. True means the position is valid (all tokens allowed).
            - If shape [*B, T, C]: token-level masking. True means the token is valid at that position.

            .. warning:: Token-level masking is considerably more memory-intensive than position-level masking.
                Only use this if you need to mask tokens.

        ignore_index (int, optional): Index to ignore in log_prob computation. Defaults to -100.

    Input shapes:
        - logits: [*B, T, C] (required)
        - mask: [*B, T] (position-level) or [*B, T, C] (token-level)
        - tokens (for log_prob): [*B, T] (token indices, with ignore_index for masked positions)

    Use cases:
        1. **Position-level masking**
            >>> logits = torch.randn(2, 10, 50000)  # [B=2, T=10, C=50000]
            >>> mask = torch.ones(2, 10, dtype=torch.bool)  # [B, T]
            >>> mask[0, :5] = False  # mask first 5 positions of first sequence
            >>> dist = LLMMaskedCategorical(logits=logits, mask=mask)
            >>> tokens = torch.randint(0, 50000, (2, 10))  # [B, T]
            >>> tokens[0, :5] = -100  # set masked positions to ignore_index
            >>> log_probs = dist.log_prob(tokens)
            >>> samples = dist.sample()  # [B, T]

        2. **Token-level masking**
            >>> logits = torch.randn(2, 10, 50000)
            >>> mask = torch.ones(2, 10, 50000, dtype=torch.bool)  # [B, T, C]
            >>> mask[0, :5, :1000] = False  # mask first 1000 tokens for first 5 positions
            >>> dist = LLMMaskedCategorical(logits=logits, mask=mask)
            >>> tokens = torch.randint(0, 50000, (2, 10))
            >>> # Optionally, set tokens at fully-masked positions to ignore_index
            >>> log_probs = dist.log_prob(tokens)
            >>> samples = dist.sample()  # [B, T]

    Notes:
        - For log_prob, tokens must be of shape [B, T] and contain valid token indices (0 <= token < C), or ignore_index for masked/ignored positions.
        - For token-level masking, if a token is masked at a given position, log_prob will return -inf for that entry.
        - For position-level masking, if a position is masked (ignore_index), log_prob will return 0.0 for that entry (correct for cross-entropy loss).
        - Sampling always respects the mask (masked tokens/positions are never sampled).

    All documented use cases are covered by tests in test_distributions.py.
    """

    def __init__(
        self,
        logits: torch.Tensor,
        mask: torch.Tensor,
        ignore_index: int = -100,
    ) -> None:
        # Validate shapes
        if logits.shape[:-1] != mask.shape and logits.shape != mask.shape:
            raise ValueError(
                f"Mask shape {mask.shape} must be either logits batch shape {logits.shape[:-1]} "
                f"(for position-level masking) or logits shape {logits.shape} "
                f"(for token-level masking)"
            )

        self._original_logits = logits
        self._mask = mask
        self.ignore_index = ignore_index
        self._position_level_masking = mask.shape == logits.shape[:-1]

        # Create masked logits for sampling (only when needed)
        self._masked_logits = None
        self._masked_dist = None

        # Set up distribution properties
        batch_shape = logits.shape[:-1]
        event_shape = logits.shape[-1:]
        super().__init__(batch_shape=batch_shape, event_shape=event_shape)

    @property
    def _sampling_logits(self):
        """Get masked logits for sampling operations."""
        if self._masked_logits is None:
            # Only create masked logits when needed for sampling
            large_neg = torch.finfo(self._original_logits.dtype).min

            if self._position_level_masking:
                # Position-level masking: expand mask to match logits shape
                mask_expanded = expand_as_right(self._mask, self._original_logits)
                self._masked_logits = self._original_logits.masked_fill(
                    ~mask_expanded, large_neg
                )
            else:
                # Token-level masking: direct masking
                self._masked_logits = self._original_logits.masked_fill(
                    ~self._mask, large_neg
                )
        return self._masked_logits

    @property
    def _sampling_dist(self):
        """Get masked distribution for sampling operations."""
        if self._masked_dist is None:
            self._masked_dist = D.Categorical(logits=self._sampling_logits)
        return self._masked_dist

    def log_prob(self, value: torch.Tensor) -> torch.Tensor:
        """Compute log probabilities using the ignore_index approach.

        This is memory-efficient as it doesn't require masking the logits.
        The value tensor should use ignore_index for masked positions.
        """
        if not self._position_level_masking:
            logits = self.masked_logits
        else:
            # Use cross_entropy with ignore_index for efficiency

            # For position-level masking, keep the default behavior (0.0 for ignore_index)
            # This is correct for cross-entropy loss computation
            # For token-level masking, we need to check if specific tokens are masked

            logits = self._original_logits
            value = value.masked_fill(~self._mask, self.ignore_index)
        if value.ndim > 1:
            # Reshape for cross_entropy: (batch, seq_len, vocab) -> (batch*seq_len, vocab)
            logits_flat = logits.reshape(-1, logits.size(-1))
            value_flat = value.reshape(-1)

            # Compute cross_entropy with ignore_index
            log_probs_flat = -F.cross_entropy(
                logits_flat, value_flat, reduce=False, ignore_index=self.ignore_index
            )

            # Reshape back
            log_probs = log_probs_flat.reshape_as(value)
        else:
            log_probs = -F.cross_entropy(
                logits,
                value,
                reduce=False,
                ignore_index=self.ignore_index,
            )
        return log_probs

    def sample(
        self, sample_shape: torch.Size | Sequence[int] | None = None
    ) -> torch.Tensor:
        """Sample from the distribution using masked logits."""
        if sample_shape is None:
            sample_shape = torch.Size()
        return self._sampling_dist.sample(sample_shape)

    def rsample(
        self, sample_shape: torch.Size | Sequence[int] | None = None
    ) -> torch.Tensor:
        """Reparameterized sampling using masked logits."""
        # This would need to be implemented based on the specific reparameterization strategy
        # For now, fall back to regular sampling
        return self.sample(sample_shape)

    @property
    def mode(self) -> torch.Tensor:
        """Get the mode using masked logits."""
        masked_logits = self._sampling_logits
        return masked_logits.argmax(dim=-1)

    def entropy(self) -> torch.Tensor:
        """Compute entropy using masked logits."""
        return self._sampling_dist.entropy()

    def clear_cache(self):
        """Clear cached masked tensors to free memory."""
        self._masked_logits = None
        self._masked_dist = None

    @property
    def mask(self) -> torch.Tensor:
        """Get the mask."""
        return self._mask

    @property
    def logits(self) -> torch.Tensor:
        """Get the original logits."""
        return self._original_logits

    @property
    def probs(self) -> torch.Tensor:
        """Get probabilities from original logits."""
        return torch.softmax(self._original_logits, dim=-1)

    @property
    def masked_logits(self) -> torch.Tensor:
        """Get the masked logits for sampling operations."""
        return self._sampling_logits

    @property
    def masked_dist(self) -> D.Categorical:
        """Get the masked distribution for sampling operations."""
        return self._sampling_dist

    @property
    def position_level_masking(self) -> bool:
        """Whether the mask is position-level (True) or token-level (False)."""
        return self._position_level_masking
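
The hunk above is the full source of torchrl/modules/distributions/discrete.py. As a quick orientation, the sketch below shows how the two core classes are typically used. It is not part of the wheel contents, and it assumes OneHotCategorical and MaskedCategorical are re-exported from torchrl.modules as listed in __all__; adjust the import path if your install differs.

import torch
from torchrl.modules import MaskedCategorical, OneHotCategorical

torch.manual_seed(0)

# Pass-through reparameterization: rsample() returns one-hot samples through
# which gradients flow back to the logits (ReparamGradientStrategy.PassThrough).
logits = torch.randn(4, requires_grad=True)
dist = OneHotCategorical(logits=logits)
sample = dist.rsample((3,))                   # shape [3, 4], one-hot rows
(sample * torch.arange(4.0)).sum().backward()
assert logits.grad is not None

# Dense boolean mask: index 1 can never be drawn and gets -inf log-probability.
mask = torch.tensor([True, False, True, True])
mdist = MaskedCategorical(logits=torch.zeros(4), mask=mask)
draws = mdist.sample((8,))                    # never contains 1
log_p = mdist.log_prob(torch.tensor([0, 1]))  # approximately [log(1/3), -inf]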