torchrl-0.11.0-cp314-cp314t-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395)
  1. benchmarks/benchmark_batched_envs.py +104 -0
  2. benchmarks/conftest.py +91 -0
  3. benchmarks/ecosystem/gym_env_throughput.py +321 -0
  4. benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
  5. benchmarks/requirements.txt +7 -0
  6. benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
  7. benchmarks/test_collectors_benchmark.py +240 -0
  8. benchmarks/test_compressed_storage_benchmark.py +145 -0
  9. benchmarks/test_envs_benchmark.py +133 -0
  10. benchmarks/test_llm.py +101 -0
  11. benchmarks/test_non_tensor_env_benchmark.py +70 -0
  12. benchmarks/test_objectives_benchmarks.py +1199 -0
  13. benchmarks/test_replaybuffer_benchmark.py +254 -0
  14. sota-check/README.md +35 -0
  15. sota-implementations/README.md +142 -0
  16. sota-implementations/a2c/README.md +39 -0
  17. sota-implementations/a2c/a2c_atari.py +291 -0
  18. sota-implementations/a2c/a2c_mujoco.py +273 -0
  19. sota-implementations/a2c/utils_atari.py +240 -0
  20. sota-implementations/a2c/utils_mujoco.py +160 -0
  21. sota-implementations/bandits/README.md +7 -0
  22. sota-implementations/bandits/dqn.py +126 -0
  23. sota-implementations/cql/cql_offline.py +198 -0
  24. sota-implementations/cql/cql_online.py +249 -0
  25. sota-implementations/cql/discrete_cql_offline.py +180 -0
  26. sota-implementations/cql/discrete_cql_online.py +227 -0
  27. sota-implementations/cql/utils.py +471 -0
  28. sota-implementations/crossq/crossq.py +271 -0
  29. sota-implementations/crossq/utils.py +320 -0
  30. sota-implementations/ddpg/ddpg.py +231 -0
  31. sota-implementations/ddpg/utils.py +325 -0
  32. sota-implementations/decision_transformer/dt.py +163 -0
  33. sota-implementations/decision_transformer/lamb.py +167 -0
  34. sota-implementations/decision_transformer/online_dt.py +178 -0
  35. sota-implementations/decision_transformer/utils.py +562 -0
  36. sota-implementations/discrete_sac/discrete_sac.py +243 -0
  37. sota-implementations/discrete_sac/utils.py +324 -0
  38. sota-implementations/dqn/README.md +30 -0
  39. sota-implementations/dqn/dqn_atari.py +272 -0
  40. sota-implementations/dqn/dqn_cartpole.py +236 -0
  41. sota-implementations/dqn/utils_atari.py +132 -0
  42. sota-implementations/dqn/utils_cartpole.py +90 -0
  43. sota-implementations/dreamer/README.md +129 -0
  44. sota-implementations/dreamer/dreamer.py +586 -0
  45. sota-implementations/dreamer/dreamer_utils.py +1107 -0
  46. sota-implementations/expert-iteration/README.md +352 -0
  47. sota-implementations/expert-iteration/ei_utils.py +770 -0
  48. sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
  49. sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
  50. sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
  51. sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
  52. sota-implementations/gail/gail.py +327 -0
  53. sota-implementations/gail/gail_utils.py +68 -0
  54. sota-implementations/gail/ppo_utils.py +157 -0
  55. sota-implementations/grpo/README.md +273 -0
  56. sota-implementations/grpo/grpo-async.py +437 -0
  57. sota-implementations/grpo/grpo-sync.py +435 -0
  58. sota-implementations/grpo/grpo_utils.py +843 -0
  59. sota-implementations/grpo/requirements_gsm8k.txt +11 -0
  60. sota-implementations/grpo/requirements_ifeval.txt +16 -0
  61. sota-implementations/impala/README.md +33 -0
  62. sota-implementations/impala/impala_multi_node_ray.py +292 -0
  63. sota-implementations/impala/impala_multi_node_submitit.py +284 -0
  64. sota-implementations/impala/impala_single_node.py +261 -0
  65. sota-implementations/impala/utils.py +184 -0
  66. sota-implementations/iql/discrete_iql.py +230 -0
  67. sota-implementations/iql/iql_offline.py +164 -0
  68. sota-implementations/iql/iql_online.py +225 -0
  69. sota-implementations/iql/utils.py +437 -0
  70. sota-implementations/multiagent/README.md +74 -0
  71. sota-implementations/multiagent/iql.py +237 -0
  72. sota-implementations/multiagent/maddpg_iddpg.py +266 -0
  73. sota-implementations/multiagent/mappo_ippo.py +267 -0
  74. sota-implementations/multiagent/qmix_vdn.py +271 -0
  75. sota-implementations/multiagent/sac.py +337 -0
  76. sota-implementations/multiagent/utils/__init__.py +4 -0
  77. sota-implementations/multiagent/utils/logging.py +151 -0
  78. sota-implementations/multiagent/utils/utils.py +43 -0
  79. sota-implementations/ppo/README.md +29 -0
  80. sota-implementations/ppo/ppo_atari.py +305 -0
  81. sota-implementations/ppo/ppo_mujoco.py +293 -0
  82. sota-implementations/ppo/utils_atari.py +238 -0
  83. sota-implementations/ppo/utils_mujoco.py +152 -0
  84. sota-implementations/ppo_trainer/train.py +21 -0
  85. sota-implementations/redq/README.md +7 -0
  86. sota-implementations/redq/redq.py +199 -0
  87. sota-implementations/redq/utils.py +1060 -0
  88. sota-implementations/sac/sac-async.py +266 -0
  89. sota-implementations/sac/sac.py +239 -0
  90. sota-implementations/sac/utils.py +381 -0
  91. sota-implementations/sac_trainer/train.py +16 -0
  92. sota-implementations/td3/td3.py +254 -0
  93. sota-implementations/td3/utils.py +319 -0
  94. sota-implementations/td3_bc/td3_bc.py +177 -0
  95. sota-implementations/td3_bc/utils.py +251 -0
  96. torchrl/.dylibs/libc++.1.0.dylib +0 -0
  97. torchrl/__init__.py +144 -0
  98. torchrl/_extension.py +74 -0
  99. torchrl/_torchrl.cpython-314t-darwin.so +0 -0
  100. torchrl/_utils.py +1431 -0
  101. torchrl/collectors/__init__.py +48 -0
  102. torchrl/collectors/_base.py +1058 -0
  103. torchrl/collectors/_constants.py +88 -0
  104. torchrl/collectors/_multi_async.py +324 -0
  105. torchrl/collectors/_multi_base.py +1805 -0
  106. torchrl/collectors/_multi_sync.py +464 -0
  107. torchrl/collectors/_runner.py +581 -0
  108. torchrl/collectors/_single.py +2009 -0
  109. torchrl/collectors/_single_async.py +259 -0
  110. torchrl/collectors/collectors.py +62 -0
  111. torchrl/collectors/distributed/__init__.py +32 -0
  112. torchrl/collectors/distributed/default_configs.py +133 -0
  113. torchrl/collectors/distributed/generic.py +1306 -0
  114. torchrl/collectors/distributed/ray.py +1092 -0
  115. torchrl/collectors/distributed/rpc.py +1006 -0
  116. torchrl/collectors/distributed/sync.py +731 -0
  117. torchrl/collectors/distributed/utils.py +160 -0
  118. torchrl/collectors/llm/__init__.py +10 -0
  119. torchrl/collectors/llm/base.py +494 -0
  120. torchrl/collectors/llm/ray_collector.py +275 -0
  121. torchrl/collectors/llm/utils.py +36 -0
  122. torchrl/collectors/llm/weight_update/__init__.py +10 -0
  123. torchrl/collectors/llm/weight_update/vllm.py +348 -0
  124. torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
  125. torchrl/collectors/utils.py +433 -0
  126. torchrl/collectors/weight_update.py +591 -0
  127. torchrl/csrc/numpy_utils.h +38 -0
  128. torchrl/csrc/pybind.cpp +27 -0
  129. torchrl/csrc/segment_tree.h +458 -0
  130. torchrl/csrc/torch_utils.h +34 -0
  131. torchrl/csrc/utils.cpp +48 -0
  132. torchrl/csrc/utils.h +31 -0
  133. torchrl/data/__init__.py +187 -0
  134. torchrl/data/datasets/__init__.py +58 -0
  135. torchrl/data/datasets/atari_dqn.py +878 -0
  136. torchrl/data/datasets/common.py +281 -0
  137. torchrl/data/datasets/d4rl.py +489 -0
  138. torchrl/data/datasets/d4rl_infos.py +187 -0
  139. torchrl/data/datasets/gen_dgrl.py +375 -0
  140. torchrl/data/datasets/minari_data.py +643 -0
  141. torchrl/data/datasets/openml.py +177 -0
  142. torchrl/data/datasets/openx.py +798 -0
  143. torchrl/data/datasets/roboset.py +363 -0
  144. torchrl/data/datasets/utils.py +11 -0
  145. torchrl/data/datasets/vd4rl.py +432 -0
  146. torchrl/data/llm/__init__.py +34 -0
  147. torchrl/data/llm/dataset.py +491 -0
  148. torchrl/data/llm/history.py +1378 -0
  149. torchrl/data/llm/prompt.py +198 -0
  150. torchrl/data/llm/reward.py +225 -0
  151. torchrl/data/llm/topk.py +186 -0
  152. torchrl/data/llm/utils.py +543 -0
  153. torchrl/data/map/__init__.py +21 -0
  154. torchrl/data/map/hash.py +185 -0
  155. torchrl/data/map/query.py +204 -0
  156. torchrl/data/map/tdstorage.py +363 -0
  157. torchrl/data/map/tree.py +1434 -0
  158. torchrl/data/map/utils.py +103 -0
  159. torchrl/data/postprocs/__init__.py +8 -0
  160. torchrl/data/postprocs/postprocs.py +391 -0
  161. torchrl/data/replay_buffers/__init__.py +99 -0
  162. torchrl/data/replay_buffers/checkpointers.py +622 -0
  163. torchrl/data/replay_buffers/ray_buffer.py +292 -0
  164. torchrl/data/replay_buffers/replay_buffers.py +2376 -0
  165. torchrl/data/replay_buffers/samplers.py +2578 -0
  166. torchrl/data/replay_buffers/scheduler.py +265 -0
  167. torchrl/data/replay_buffers/storages.py +2412 -0
  168. torchrl/data/replay_buffers/utils.py +1042 -0
  169. torchrl/data/replay_buffers/writers.py +781 -0
  170. torchrl/data/tensor_specs.py +7101 -0
  171. torchrl/data/utils.py +334 -0
  172. torchrl/envs/__init__.py +265 -0
  173. torchrl/envs/async_envs.py +1105 -0
  174. torchrl/envs/batched_envs.py +3093 -0
  175. torchrl/envs/common.py +4241 -0
  176. torchrl/envs/custom/__init__.py +11 -0
  177. torchrl/envs/custom/chess.py +617 -0
  178. torchrl/envs/custom/llm.py +214 -0
  179. torchrl/envs/custom/pendulum.py +401 -0
  180. torchrl/envs/custom/san_moves.txt +29274 -0
  181. torchrl/envs/custom/tictactoeenv.py +288 -0
  182. torchrl/envs/env_creator.py +263 -0
  183. torchrl/envs/gym_like.py +752 -0
  184. torchrl/envs/libs/__init__.py +68 -0
  185. torchrl/envs/libs/_gym_utils.py +326 -0
  186. torchrl/envs/libs/brax.py +846 -0
  187. torchrl/envs/libs/dm_control.py +544 -0
  188. torchrl/envs/libs/envpool.py +447 -0
  189. torchrl/envs/libs/gym.py +2239 -0
  190. torchrl/envs/libs/habitat.py +138 -0
  191. torchrl/envs/libs/isaac_lab.py +87 -0
  192. torchrl/envs/libs/isaacgym.py +203 -0
  193. torchrl/envs/libs/jax_utils.py +166 -0
  194. torchrl/envs/libs/jumanji.py +963 -0
  195. torchrl/envs/libs/meltingpot.py +599 -0
  196. torchrl/envs/libs/openml.py +153 -0
  197. torchrl/envs/libs/openspiel.py +652 -0
  198. torchrl/envs/libs/pettingzoo.py +1042 -0
  199. torchrl/envs/libs/procgen.py +351 -0
  200. torchrl/envs/libs/robohive.py +429 -0
  201. torchrl/envs/libs/smacv2.py +645 -0
  202. torchrl/envs/libs/unity_mlagents.py +891 -0
  203. torchrl/envs/libs/utils.py +147 -0
  204. torchrl/envs/libs/vmas.py +813 -0
  205. torchrl/envs/llm/__init__.py +63 -0
  206. torchrl/envs/llm/chat.py +730 -0
  207. torchrl/envs/llm/datasets/README.md +4 -0
  208. torchrl/envs/llm/datasets/__init__.py +17 -0
  209. torchrl/envs/llm/datasets/gsm8k.py +353 -0
  210. torchrl/envs/llm/datasets/ifeval.py +274 -0
  211. torchrl/envs/llm/envs.py +789 -0
  212. torchrl/envs/llm/libs/README.md +3 -0
  213. torchrl/envs/llm/libs/__init__.py +8 -0
  214. torchrl/envs/llm/libs/mlgym.py +869 -0
  215. torchrl/envs/llm/reward/__init__.py +10 -0
  216. torchrl/envs/llm/reward/gsm8k.py +324 -0
  217. torchrl/envs/llm/reward/ifeval/README.md +13 -0
  218. torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
  219. torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
  220. torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
  221. torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
  222. torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
  223. torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
  224. torchrl/envs/llm/transforms/__init__.py +55 -0
  225. torchrl/envs/llm/transforms/browser.py +292 -0
  226. torchrl/envs/llm/transforms/dataloading.py +859 -0
  227. torchrl/envs/llm/transforms/format.py +73 -0
  228. torchrl/envs/llm/transforms/kl.py +1544 -0
  229. torchrl/envs/llm/transforms/policy_version.py +189 -0
  230. torchrl/envs/llm/transforms/reason.py +323 -0
  231. torchrl/envs/llm/transforms/tokenizer.py +321 -0
  232. torchrl/envs/llm/transforms/tools.py +1955 -0
  233. torchrl/envs/model_based/__init__.py +9 -0
  234. torchrl/envs/model_based/common.py +180 -0
  235. torchrl/envs/model_based/dreamer.py +112 -0
  236. torchrl/envs/transforms/__init__.py +147 -0
  237. torchrl/envs/transforms/functional.py +48 -0
  238. torchrl/envs/transforms/gym_transforms.py +203 -0
  239. torchrl/envs/transforms/module.py +341 -0
  240. torchrl/envs/transforms/r3m.py +372 -0
  241. torchrl/envs/transforms/ray_service.py +663 -0
  242. torchrl/envs/transforms/rb_transforms.py +214 -0
  243. torchrl/envs/transforms/transforms.py +11835 -0
  244. torchrl/envs/transforms/utils.py +94 -0
  245. torchrl/envs/transforms/vc1.py +307 -0
  246. torchrl/envs/transforms/vecnorm.py +845 -0
  247. torchrl/envs/transforms/vip.py +407 -0
  248. torchrl/envs/utils.py +1718 -0
  249. torchrl/envs/vec_envs.py +11 -0
  250. torchrl/modules/__init__.py +206 -0
  251. torchrl/modules/distributions/__init__.py +73 -0
  252. torchrl/modules/distributions/continuous.py +830 -0
  253. torchrl/modules/distributions/discrete.py +908 -0
  254. torchrl/modules/distributions/truncated_normal.py +187 -0
  255. torchrl/modules/distributions/utils.py +233 -0
  256. torchrl/modules/llm/__init__.py +62 -0
  257. torchrl/modules/llm/backends/__init__.py +65 -0
  258. torchrl/modules/llm/backends/vllm/__init__.py +94 -0
  259. torchrl/modules/llm/backends/vllm/_models.py +46 -0
  260. torchrl/modules/llm/backends/vllm/base.py +72 -0
  261. torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
  262. torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
  263. torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
  264. torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
  265. torchrl/modules/llm/policies/__init__.py +28 -0
  266. torchrl/modules/llm/policies/common.py +1809 -0
  267. torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
  268. torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
  269. torchrl/modules/llm/utils.py +23 -0
  270. torchrl/modules/mcts/__init__.py +21 -0
  271. torchrl/modules/mcts/scores.py +579 -0
  272. torchrl/modules/models/__init__.py +86 -0
  273. torchrl/modules/models/batchrenorm.py +119 -0
  274. torchrl/modules/models/decision_transformer.py +179 -0
  275. torchrl/modules/models/exploration.py +731 -0
  276. torchrl/modules/models/llm.py +156 -0
  277. torchrl/modules/models/model_based.py +596 -0
  278. torchrl/modules/models/models.py +1712 -0
  279. torchrl/modules/models/multiagent.py +1067 -0
  280. torchrl/modules/models/recipes/impala.py +185 -0
  281. torchrl/modules/models/utils.py +162 -0
  282. torchrl/modules/planners/__init__.py +10 -0
  283. torchrl/modules/planners/cem.py +228 -0
  284. torchrl/modules/planners/common.py +73 -0
  285. torchrl/modules/planners/mppi.py +265 -0
  286. torchrl/modules/tensordict_module/__init__.py +89 -0
  287. torchrl/modules/tensordict_module/actors.py +2457 -0
  288. torchrl/modules/tensordict_module/common.py +529 -0
  289. torchrl/modules/tensordict_module/exploration.py +814 -0
  290. torchrl/modules/tensordict_module/probabilistic.py +321 -0
  291. torchrl/modules/tensordict_module/rnn.py +1639 -0
  292. torchrl/modules/tensordict_module/sequence.py +132 -0
  293. torchrl/modules/tensordict_module/world_models.py +34 -0
  294. torchrl/modules/utils/__init__.py +38 -0
  295. torchrl/modules/utils/mappings.py +9 -0
  296. torchrl/modules/utils/utils.py +89 -0
  297. torchrl/objectives/__init__.py +78 -0
  298. torchrl/objectives/a2c.py +659 -0
  299. torchrl/objectives/common.py +753 -0
  300. torchrl/objectives/cql.py +1346 -0
  301. torchrl/objectives/crossq.py +710 -0
  302. torchrl/objectives/ddpg.py +453 -0
  303. torchrl/objectives/decision_transformer.py +371 -0
  304. torchrl/objectives/deprecated.py +516 -0
  305. torchrl/objectives/dqn.py +683 -0
  306. torchrl/objectives/dreamer.py +488 -0
  307. torchrl/objectives/functional.py +48 -0
  308. torchrl/objectives/gail.py +258 -0
  309. torchrl/objectives/iql.py +996 -0
  310. torchrl/objectives/llm/__init__.py +30 -0
  311. torchrl/objectives/llm/grpo.py +846 -0
  312. torchrl/objectives/llm/sft.py +482 -0
  313. torchrl/objectives/multiagent/__init__.py +8 -0
  314. torchrl/objectives/multiagent/qmixer.py +396 -0
  315. torchrl/objectives/ppo.py +1669 -0
  316. torchrl/objectives/redq.py +683 -0
  317. torchrl/objectives/reinforce.py +530 -0
  318. torchrl/objectives/sac.py +1580 -0
  319. torchrl/objectives/td3.py +570 -0
  320. torchrl/objectives/td3_bc.py +625 -0
  321. torchrl/objectives/utils.py +782 -0
  322. torchrl/objectives/value/__init__.py +28 -0
  323. torchrl/objectives/value/advantages.py +1956 -0
  324. torchrl/objectives/value/functional.py +1459 -0
  325. torchrl/objectives/value/utils.py +360 -0
  326. torchrl/record/__init__.py +17 -0
  327. torchrl/record/loggers/__init__.py +23 -0
  328. torchrl/record/loggers/common.py +48 -0
  329. torchrl/record/loggers/csv.py +226 -0
  330. torchrl/record/loggers/mlflow.py +142 -0
  331. torchrl/record/loggers/tensorboard.py +139 -0
  332. torchrl/record/loggers/trackio.py +163 -0
  333. torchrl/record/loggers/utils.py +78 -0
  334. torchrl/record/loggers/wandb.py +214 -0
  335. torchrl/record/recorder.py +554 -0
  336. torchrl/services/__init__.py +79 -0
  337. torchrl/services/base.py +109 -0
  338. torchrl/services/ray_service.py +453 -0
  339. torchrl/testing/__init__.py +107 -0
  340. torchrl/testing/assertions.py +179 -0
  341. torchrl/testing/dist_utils.py +122 -0
  342. torchrl/testing/env_creators.py +227 -0
  343. torchrl/testing/env_helper.py +35 -0
  344. torchrl/testing/gym_helpers.py +156 -0
  345. torchrl/testing/llm_mocks.py +119 -0
  346. torchrl/testing/mocking_classes.py +2720 -0
  347. torchrl/testing/modules.py +295 -0
  348. torchrl/testing/mp_helpers.py +15 -0
  349. torchrl/testing/ray_helpers.py +293 -0
  350. torchrl/testing/utils.py +190 -0
  351. torchrl/trainers/__init__.py +42 -0
  352. torchrl/trainers/algorithms/__init__.py +11 -0
  353. torchrl/trainers/algorithms/configs/__init__.py +705 -0
  354. torchrl/trainers/algorithms/configs/collectors.py +216 -0
  355. torchrl/trainers/algorithms/configs/common.py +41 -0
  356. torchrl/trainers/algorithms/configs/data.py +308 -0
  357. torchrl/trainers/algorithms/configs/envs.py +104 -0
  358. torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
  359. torchrl/trainers/algorithms/configs/logging.py +80 -0
  360. torchrl/trainers/algorithms/configs/modules.py +570 -0
  361. torchrl/trainers/algorithms/configs/objectives.py +177 -0
  362. torchrl/trainers/algorithms/configs/trainers.py +340 -0
  363. torchrl/trainers/algorithms/configs/transforms.py +955 -0
  364. torchrl/trainers/algorithms/configs/utils.py +252 -0
  365. torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
  366. torchrl/trainers/algorithms/configs/weight_update.py +159 -0
  367. torchrl/trainers/algorithms/ppo.py +373 -0
  368. torchrl/trainers/algorithms/sac.py +308 -0
  369. torchrl/trainers/helpers/__init__.py +40 -0
  370. torchrl/trainers/helpers/collectors.py +416 -0
  371. torchrl/trainers/helpers/envs.py +573 -0
  372. torchrl/trainers/helpers/logger.py +33 -0
  373. torchrl/trainers/helpers/losses.py +132 -0
  374. torchrl/trainers/helpers/models.py +658 -0
  375. torchrl/trainers/helpers/replay_buffer.py +59 -0
  376. torchrl/trainers/helpers/trainers.py +301 -0
  377. torchrl/trainers/trainers.py +2052 -0
  378. torchrl/weight_update/__init__.py +33 -0
  379. torchrl/weight_update/_distributed.py +749 -0
  380. torchrl/weight_update/_mp.py +624 -0
  381. torchrl/weight_update/_noupdate.py +102 -0
  382. torchrl/weight_update/_ray.py +1032 -0
  383. torchrl/weight_update/_rpc.py +284 -0
  384. torchrl/weight_update/_shared.py +891 -0
  385. torchrl/weight_update/llm/__init__.py +32 -0
  386. torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
  387. torchrl/weight_update/llm/vllm_nccl.py +710 -0
  388. torchrl/weight_update/utils.py +73 -0
  389. torchrl/weight_update/weight_sync_schemes.py +1244 -0
  390. torchrl-0.11.0.dist-info/METADATA +1308 -0
  391. torchrl-0.11.0.dist-info/RECORD +395 -0
  392. torchrl-0.11.0.dist-info/WHEEL +5 -0
  393. torchrl-0.11.0.dist-info/entry_points.txt +2 -0
  394. torchrl-0.11.0.dist-info/licenses/LICENSE +21 -0
  395. torchrl-0.11.0.dist-info/top_level.txt +7 -0
torchrl/objectives/llm/grpo.py (new file, 846 added lines):
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import annotations

import contextlib

from collections import defaultdict, deque
from dataclasses import dataclass
from typing import Literal, TypeVar

import torch
from tensordict import (
    is_tensor_collection,
    NestedKey,
    TensorClass,
    TensorDict,
    TensorDictBase,
)
from tensordict.nn import (
    CompositeDistribution,
    ProbabilisticTensorDictModule,
    ProbabilisticTensorDictSequential,
    set_composite_lp_aggregate,
)
from tensordict.utils import expand_as_right
from torch import distributions as d
from torchrl._utils import logger as torchrl_logger, VERBOSE
from torchrl.envs.transforms.transforms import Transform
from torchrl.modules.llm import LLMWrapperBase
from torchrl.objectives.common import LossModule
from torchrl.objectives.utils import _reduce, _sum_td_features

class LLMLossOutput(TensorClass["nocast"]):
    """Base class for LLM loss outputs.

    This base class defines the common structure for all LLM-based policy optimization
    loss outputs (GRPO, DAPO, CISPO, etc.).
    """

    loss_objective: torch.Tensor
    clip_fraction: torch.Tensor
    kl_approx: torch.Tensor
    ESS: torch.Tensor
    entropy: torch.Tensor | None = None
    loss_entropy: torch.Tensor | None = None
    loss_kl_to_ref: torch.Tensor | None = None
    kl_to_ref: torch.Tensor | None = None
    loss_kl_to_inference: torch.Tensor | None = None
    kl_to_inference: torch.Tensor | None = None


LLMOutputType = TypeVar("LLMOutputType", bound=LLMLossOutput)


class GRPOLossOutput(LLMLossOutput):
    """GRPO Loss Output."""


class DAPOLossOutput(LLMLossOutput):
    """DAPO Loss Output."""


class CISPOLossOutput(LLMLossOutput):
    """CISPO Loss Output."""

class GRPOLoss(LossModule):
    """GRPO loss.

    The clipped importance weighted loss is computed as follows::

        loss = -min( weight * advantage, min(max(weight, 1-eps), 1+eps) * advantage)

    Args:
        actor_network (LLMWrapperBase): policy operator.

    .. note::
        It is critical to keep your model in eval mode during GRPO training to ensure deterministic behavior and correct
        importance sampling. A mismatch between train and eval modes is a common cause of instability or failure to learn
        in RL post-training.

    .. note::
        The Effective Sample Size (ESS) is a key diagnostic metric in GRPO. ESS measures the effective number of samples
        in the batch, computed as the inverse of the sum of the squared importance weights.
        A value of 1 indicates that all importance weights are equal (ideal case). If ESS drops or increases significantly,
        it usually indicates a problem with the model configuration, such as a train/eval mode mismatch or a large policy update.

    .. note::
        The masking_strategy parameter is crucial for LLM training scenarios. It determines which tokens are included
        in the loss computation:

        - "sft": Only response tokens (excludes prompt tokens) - suitable for single-turn conversations
        - "rlhf": Only assistant tokens (excludes user/system tokens) - suitable for multi-turn conversations
        - "generic": All valid tokens (excludes padding tokens) - suitable for generic scenarios

        The masking strategy must match the strategy used for advantage computation to avoid shape mismatches.

    Keyword Args:
        clip_epsilon (float | tuple[float, float], optional): clipping threshold(s) for the clipped surrogate.

            - float x: symmetric clipping [1 - x, 1 + x] (default: 0.2)
            - tuple (eps_low, eps_high): asymmetric clipping [1 - eps_low, 1 + eps_high] as in DAPO Clip-Higher;
              recommended defaults from DAPO: (0.20, 0.28); see Eq. (10) in the paper.
        kl_mask_threshold (float | None, optional): enable token-wise trust-region filtering (KL-Mask).
            When set, tokens with 0.5 * (log(pi_theta/pi_ref))^2 > kl_mask_threshold are masked out from the loss.
            This stabilizes updates by skipping tokens that drifted too far from the reference distribution
            (enables a per-token trust region).
        aggregation (str, optional): loss aggregation strategy for the policy objective.

            - "token_mean": global masked token mean (weights long sequences more). Default.
            - "prompt_mean": per-sample masked mean over tokens, then mean across samples (equal sample weight).
            - "none": return per-token loss (mask applied, no aggregation). Useful for downstream custom reductions.
        entropy_bonus (bool, optional): if ``True``, an entropy bonus will be added to the
            loss to favour exploratory policies.
        samples_mc_entropy (int, optional): if the distribution retrieved from the policy
            operator does not have a closed form
            formula for the entropy, a Monte-Carlo estimate will be used.
            ``samples_mc_entropy`` will control how many
            samples will be used to compute this estimate.
            Defaults to ``1``.
        entropy_coeff (scalar, optional): entropy multiplier when computing the total loss.
            Defaults to ``0.01``.
        advantage_key (str, optional): [Deprecated, use set_keys(advantage_key=advantage_key) instead]
            The input tensordict key where the advantage is
            expected to be written. Defaults to ``"advantage"``.
        reduction (str, optional): Specifies the reduction to apply to the output:
            ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied,
            ``"mean"``: the sum of the output will be divided by the number of
            elements in the output, ``"sum"``: the output will be summed. Default: ``"mean"``.
        clip_value (bool or float, optional): If a ``float`` is provided, it will be used to compute a clipped
            version of the value prediction with respect to the input tensordict value estimate and use it to
            calculate the value loss. The purpose of clipping is to limit the impact of extreme value predictions,
            helping stabilize training and preventing large updates. However, it will have no impact if the value
            estimate was done by the current version of the value estimator. If instead ``True`` is provided, the
            ``clip_epsilon`` parameter will be used as the clipping threshold. If not provided or ``False``, no
            clipping will be performed. Defaults to ``False``.
        kl_to_ref_coeff (float, optional): coefficient for the KL divergence to the reference policy. Defaults to ``None`` (no KL divergence).
        kl_to_inference_coeff (float, optional): coefficient for the KL divergence to the inference policy. Defaults to ``None`` (no KL divergence).
        device (torch.device, optional): device of the buffers. Defaults to ``None``.
        masking_strategy (Literal["sft", "rlhf", "generic"], optional): The masking strategy to use for distribution creation.

            - "sft": Use prompt masking (response tokens only, suitable for single-turn)
            - "rlhf": Use assistant masking (assistant tokens only, suitable for multi-turn)
            - "generic": Use attention masking (all valid tokens)

            Defaults to "sft" since we can't guarantee assistant masks are available.

    .. note:: Parameters and buffers from the policy / critic will not be cast to that device to ensure that
        the storages match the ones that are passed to other components, such as data collectors.

    .. note:: For non-symmetric clipping thresholds, see the `DAPO <https://arxiv.org/html/2503.14476>`_ paper.

    """

    actor_network: LLMWrapperBase
    output_type: type[LLMLossOutput] = GRPOLossOutput

    @dataclass
    class _AcceptedKeys(LossModule._AcceptedKeys):
        """Maintains default values for all configurable tensordict keys.

        This class defines which tensordict keys can be set using '.set_keys(key_name=key_value)' and their
        default values
        """

        advantage: NestedKey = "advantage"
        action: NestedKey = ("tokens", "full")
        sample_log_prob: NestedKey = ("log_probs", "full")
        ref_log_probs: NestedKey = ("next", "ref_log_probs", "full")

    @property
    def tensor_keys(self) -> _AcceptedKeys:
        """Access the tensordict key configuration for this loss.

        This property provides access to the configurable keys used by the loss module
        to read tensors from input TensorDicts. These keys include:

        - ``advantage``: key for the advantage values
        - ``action``: key for the action tokens (default: ``("tokens", "full")``)
        - ``sample_log_prob``: key for the log probabilities recorded by the data-collection (inference) policy (default: ``("log_probs", "full")``)
        - ``ref_log_probs``: key for the reference policy log probabilities (default: ``("next", "ref_log_probs", "full")``)

        To modify these keys, use the :meth:`~.set_keys` method.

        Examples:
            >>> loss = GRPOLoss(actor_network)
            >>> # Access current keys
            >>> print(loss.tensor_keys.advantage)  # "advantage"
            >>> # Modify keys
            >>> loss.set_keys(advantage="my_advantage_key")
            >>> print(loss.tensor_keys.advantage)  # "my_advantage_key"

        Returns:
            An instance of _AcceptedKeys containing all configurable tensordict keys.
        """
        return self._tensor_keys

    def __init__(
        self,
        actor_network: LLMWrapperBase | None = None,
        *,
        clip_epsilon: float | tuple[float, float] = 0.2,
        kl_mask_threshold: float | None = None,
        aggregation: str | None = "token_mean",
        entropy_bonus: bool = True,
        samples_mc_entropy: int = 1,
        entropy_coeff: float = 0.01,
        gamma: float | None = None,
        reduction: str | None = None,
        clip_value: bool | float | None = None,
        kl_to_ref_coeff: float | None = None,
        kl_to_inference_coeff: float | None = None,
        device: torch.device | None = None,
        masking_strategy: Literal["sft", "rlhf", "generic"] = "sft",
        **kwargs,
    ):
        super().__init__()
        # Core modules and hyper-parameters
        self.actor_network = actor_network
        self.entropy_bonus = entropy_bonus
        self.samples_mc_entropy = samples_mc_entropy
        self.entropy_coeff = entropy_coeff
        self.reduction = reduction if reduction is not None else "mean"
        self.kl_mask_threshold = kl_mask_threshold
        self.aggregation = aggregation or "token_mean"

        # Determine device and register clip epsilon as buffer
        if device is None:
            try:
                device = next(self.parameters()).device
            except (AttributeError, StopIteration):
                device = getattr(
                    torch, "get_default_device", lambda: torch.device("cpu")
                )()
        # Accept symmetric or asymmetric thresholds
        if isinstance(clip_epsilon, (tuple, list)):
            if len(clip_epsilon) != 2:
                raise ValueError(
                    f"clip_epsilon tuple must have length 2, got {clip_epsilon}."
                )
            eps_low, eps_high = clip_epsilon
        else:
            eps_low = float(clip_epsilon)
            eps_high = float(clip_epsilon)
        # Basic validation
        if eps_low < 0 or eps_high < 0:
            raise ValueError(
                f"clip_epsilon values must be non-negative, got ({eps_low}, {eps_high})."
            )
        if eps_low >= 1.0:
            raise ValueError(
                f"clip_epsilon low must be < 1 (to keep 1 - eps_low > 0), got {eps_low}."
            )
        # Register buffers
        self.register_buffer("clip_epsilon_low", torch.tensor(eps_low, device=device))
        self.register_buffer("clip_epsilon_high", torch.tensor(eps_high, device=device))

        self.masking_strategy = masking_strategy
        # Defaults for keys
        self.set_keys(sample_log_prob=("log_probs", "full"), action=("tokens", "full"))
        # KL coefficients
        self.kl_to_ref_coeff = kl_to_ref_coeff
        self.kl_to_inference_coeff = kl_to_inference_coeff
        # Prepare IO keys
        self._set_in_keys()

    @property
    def _clip_bounds(self):
        # Returns (log(1 - eps_low), log(1 + eps_high)) for clamping log-weight
        return (
            (-self.clip_epsilon_low).log1p(),
            self.clip_epsilon_high.log1p(),
        )

    def _set_in_keys(self):
        keys = []
        if getattr(self, "actor_network", None) is not None and hasattr(
            self.actor_network, "in_keys"
        ):
            in_keys = self.actor_network.in_keys
            if isinstance(in_keys, (list, tuple)):
                keys.extend(in_keys)
        keys.append(self.tensor_keys.action)
        keys.append(self.tensor_keys.sample_log_prob)
        keys.append(self.tensor_keys.advantage)
        keys.append(self.tensor_keys.ref_log_probs)
        self._in_keys = list(dict.fromkeys(keys))

    @property
    def in_keys(self):
        if getattr(self, "_in_keys", None) is None:
            self._set_in_keys()
        return self._in_keys

    @in_keys.setter
    def in_keys(self, values):
        self._in_keys = values

    @property
    def out_keys(self):
        if getattr(self, "_out_keys", None) is None:
            keys = ["loss_objective", "clip_fraction", "ESS", "kl_approx"]
            if self.entropy_bonus:
                keys.extend(["entropy", "loss_entropy"])
            keys.extend(
                [
                    "loss_kl_to_ref",
                    "kl_to_ref",
                    "loss_kl_to_inference",
                    "kl_to_inference",
                ]
            )
            self._out_keys = keys
        return self._out_keys

    @out_keys.setter
    def out_keys(self, values):
        self._out_keys = values

    def _forward_value_estimator_keys(self, **kwargs) -> None:
        # No value estimator in GRPO; simply refresh input keys
        self._set_in_keys()

    def _get_cur_log_prob(self, tensordict):
        """Override to use LLM-specific distribution with explicit masking strategy.

        This ensures that the loss is computed with the correct masking strategy,
        and provides helpful error messages when there are shape mismatches.
        """
        if isinstance(
            self.actor_network,
            (ProbabilisticTensorDictSequential, ProbabilisticTensorDictModule),
        ) or hasattr(self.actor_network, "get_dist"):
            # Use the specified masking strategy
            # dists are always defined over the whole sequence, so we can re-use the mask as the dist will always
            # be a MaskedCategorical
            # TODO: eventually, we want to always use `get_dist` and just pass the key of the mask
            # Masks should contain: prompt and response masks, assistant, and attention.
            # Additionally, we should make sure that the masks are properly updated when log-probs is called (using vllm and transformers)
            # because in some instances it looks like they can be overwritten with None values.
            if self.masking_strategy == "sft" and hasattr(
                self.actor_network, "_get_sft_dist"
            ):
                dist = self.actor_network._get_sft_dist(tensordict)
            elif self.masking_strategy == "rlhf" and hasattr(
                self.actor_network, "_get_rlhf_dist"
            ):
                dist = self.actor_network._get_rlhf_dist(tensordict)
            elif self.masking_strategy == "generic" and hasattr(
                self.actor_network, "_get_generic_dist"
            ):
                dist = self.actor_network._get_generic_dist(tensordict)
            elif hasattr(self.actor_network, "get_dist"):
                # Fallback to generic distribution method
                dist = self.actor_network.get_dist(
                    tensordict,
                    logits_key="logits",
                )
            else:
                raise NotImplementedError(
                    f"Actor network must have get_dist method or the appropriate method for "
                    f"masking strategy '{self.masking_strategy}'."
                )

            action = tensordict.get(
                self.tensor_keys.action,
                as_padded_tensor=True,
                padding_side="left",
                padding_value=-100,
            )
            log_prob = dist.log_prob(action)
        else:
            raise NotImplementedError(
                "Only probabilistic modules from tensordict.nn are currently supported. "
                "If you need to implement a custom logic to retrieve the log-probs (to compute "
                "the PPO objective) or the distribution (for the PPO entropy), please augment "
                f"the {type(self).__name__} by implementing your own logic in _get_cur_log_prob."
            )
        return log_prob, dist, False

    def forward(self, tensordict: TensorDictBase) -> LLMOutputType:
        # Some sanity checks and housekeeping:
        # - We may not have the tokens yet. If not, we will use the tokenizer of the actor to tokenize the text.
        #   We default to history rather than text because the history will account for multiturn, or multimodal inputs.
        if self.tensor_keys.action not in tensordict:
            raise ValueError(f"Action key {self.tensor_keys.action} not in tensordict.")

        tensordict = tensordict.copy()
        advantage = tensordict.get(
            self.tensor_keys.advantage, None, as_padded_tensor=True
        )
        if advantage is None:
            raise ValueError(
                f"Advantage key {self.tensor_keys.advantage} not in tensordict."
            )
        log_weight, dist, kl_approx = self._log_weight(
            tensordict, adv_shape=advantage.shape[:-1]
        )
        mask = dist.mask

        # Optional per-token trust-region filtering (KL-Mask) vs reference policy
        if self.kl_mask_threshold is not None and self.kl_mask_threshold > 0:
            try:
                inference_log_prob = tensordict.get(
                    self.tensor_keys.sample_log_prob,
                    as_padded_tensor=True,
                    padding_side="left",
                    padding_value=0.0,
                )
            except KeyError:
                inference_log_prob = None
            cur_log_prob = tensordict.get("_cur_log_prob", None)
            if (inference_log_prob is not None) and (cur_log_prob is not None):
                # Align to valid tokens only (safety)
                cur_log_prob_masked = torch.where(
                    expand_as_right(mask, cur_log_prob), cur_log_prob, 0.0
                )
                inference_log_prob_masked = torch.where(
                    expand_as_right(mask, inference_log_prob), inference_log_prob, 0.0
                )
                log_is_ref = cur_log_prob_masked - inference_log_prob_masked
                kl_token = 0.5 * (log_is_ref**2)
                tr_mask = kl_token <= self.kl_mask_threshold
                # Combine with attention mask
                mask = mask & tr_mask
        # ESS for logging
        with torch.no_grad():
            # In theory, ESS should be computed on particles sampled from the same source. Here we sample according
            # to different, unrelated trajectories, which is not standard. Still, it can give an idea of the weights'
            # dispersion.
            lw = log_weight.squeeze(-1)[mask]
            batch = mask.sum()
            ess = (2 * lw.logsumexp(0) - (2 * lw).logsumexp(0)).exp()

        if advantage.ndim != log_weight.ndim:
            raise ValueError(
                f"advantage and log_weight must have the same number of dimensions, got {advantage.ndim=} and {log_weight.ndim=}"
            )
        loss_objective, clip_fraction = self._compute_policy_objective(
            log_weight, advantage
        )
        td_out = TensorDict({"loss_objective": loss_objective})
        td_out.set("clip_fraction", clip_fraction)
        td_out.set("kl_approx", kl_approx.detach().mean())  # for logging

        if self.entropy_bonus:
            entropy = self._get_entropy(dist, adv_shape=advantage.shape[:-1])
            if is_tensor_collection(entropy):
                # Reports the entropy of each action head.
                td_out.set("composite_entropy", entropy.detach())
                entropy = _sum_td_features(entropy)
            td_out.set("entropy", entropy.detach().mean())  # for logging
            td_out.set("loss_entropy", -self.entropy_coeff * entropy)

        td_out.set("ESS", _reduce(ess / batch, self.reduction))
        # Aggregate loss terms according to aggregation strategy
        for key in list(td_out.keys()):
            if isinstance(key, tuple) or not isinstance(key, str):
                continue
            if key.startswith("loss_"):
                val = td_out.get(key)
                td_out.set(key, self._aggregate_loss_value(val, mask))
        if self.kl_to_ref_coeff is not None and self.kl_to_ref_coeff > 0:
            # FIXME: parameterize this
            loss_kl, kl_penalty = self._kl_to_ref(
                tensordict,
                mask=mask,
                dist=dist,
                ref_log_prob=tensordict.get(
                    self.tensor_keys.ref_log_probs,
                    as_padded_tensor=True,
                    padding_side="left",
                    padding_value=0.0,
                ),
            )
            td_out["loss_kl_to_ref"] = loss_kl
            td_out["kl_to_ref"] = kl_penalty.detach()
        if self.kl_to_inference_coeff is not None:
            loss_kl, kl_penalty = self._kl_to_ref(
                tensordict,
                key=self.tensor_keys.sample_log_prob,
                coeff=self.kl_to_inference_coeff,
                mask=mask,
                dist=dist,
            )
            td_out["loss_kl_to_inference"] = loss_kl
            td_out["kl_to_inference"] = kl_penalty.detach()
        del tensordict["_cur_log_prob"]
        return self.output_type.from_tensordict(td_out)

    def _compute_policy_objective(
        self, log_weight: torch.Tensor, advantage: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Default GRPO objective: PPO-style min between unclipped and clipped ratios.

        Returns (loss_objective, clip_fraction).
        """
        gain1 = log_weight.exp() * advantage
        log_weight_clip = log_weight.clamp(*self._clip_bounds)
        clip_fraction = (log_weight_clip != log_weight).to(log_weight.dtype).mean()
        ratio = log_weight_clip.exp()
        gain2 = ratio * advantage
        gain = torch.stack([gain1, gain2], -1).min(dim=-1).values
        return -gain, clip_fraction

    def _aggregate_loss_value(
        self, value: torch.Tensor, mask: torch.Tensor
    ) -> torch.Tensor:
        """Aggregate a per-token loss tensor using the configured strategy.

        Supports:
        - token_mean: masked mean across all tokens (default)
        - prompt_mean: per-sample masked mean over tokens, then mean across batch
        - none: return per-token loss with masked-out tokens set to 0

        The input `value` is expected to have shape [..., T, 1] where T is the token dimension,
        and `mask` has shape [..., T].
        """
        if self.aggregation == "none" or self.reduction == "none":
            mask_exp = expand_as_right(mask, value)
            return torch.where(mask_exp, value, value.new_zeros(()).expand_as(value))

        if self.aggregation == "prompt_mean":
            # Mean over valid tokens per sample, then mean across batch
            mask_exp = expand_as_right(mask, value).to(value.dtype)
            token_sum = (value * mask_exp).sum(dim=-2, keepdim=False)
            token_count = mask_exp.sum(dim=-2, keepdim=False).clamp_min(1.0)
            sample_mean = token_sum / token_count
            return sample_mean.mean(dim=0, keepdim=False)

        # token_mean (global masked mean)
        return _reduce(value, reduction="mean", mask=mask).squeeze(-1)

    def _get_entropy(
        self, dist: d.Distribution, adv_shape: torch.Size
    ) -> torch.Tensor | TensorDict:
        try:
            entropy = dist.entropy()
            if not entropy.isfinite().all():
                del entropy
                if VERBOSE:
                    torchrl_logger.info(
                        "Entropy is not finite. Using Monte Carlo sampling."
                    )
                raise NotImplementedError
        except NotImplementedError:
            if VERBOSE:
                torchrl_logger.warning(
                    f"Entropy not implemented for {type(dist)} or is not finite. Using Monte Carlo sampling."
                )
            if getattr(dist, "has_rsample", False):
                x = dist.rsample((self.samples_mc_entropy,))
            else:
                x = dist.sample((self.samples_mc_entropy,))
            with set_composite_lp_aggregate(False) if isinstance(
                dist, CompositeDistribution
            ) else contextlib.nullcontext():
                log_prob = dist.log_prob(x)
                if is_tensor_collection(log_prob):
                    if isinstance(self.tensor_keys.sample_log_prob, NestedKey):
                        log_prob = log_prob.get(self.tensor_keys.sample_log_prob)
                    else:
                        log_prob = log_prob.select(*self.tensor_keys.sample_log_prob)
            entropy = -log_prob.mean(0)
        if is_tensor_collection(entropy) and entropy.batch_size != adv_shape:
            entropy.batch_size = adv_shape
        return entropy.unsqueeze(-1)

    def _kl_to_ref(
        self,
        tensordict: TensorDictBase,
        key: NestedKey = ("next", "ref_log_probs"),
        ref_log_prob: torch.Tensor | None = None,
        coeff: float | None = None,
        mask: torch.Tensor | None = None,
        dist: d.Distribution | None = None,
    ):
        if coeff is None:
            coeff = self.kl_to_ref_coeff
        # TODO: customize this
        if ref_log_prob is None:
            ref_log_prob = tensordict.get(
                key,
                as_padded_tensor=True,
                padding_side="left",
                padding_value=0.0,
            )
            if ref_log_prob is None:
                raise KeyError(
                    f"Couldn't find the ref log-prob {key} in the input data ({tensordict.keys(True)=})."
                )
            ref_log_prob = ref_log_prob.squeeze(-1)
        cur_log_prob = tensordict.get("_cur_log_prob")
        # TODO: remove this
        if cur_log_prob.shape != ref_log_prob.shape:
            raise ValueError(
                f"cur_log_prob and ref_log_prob must have the same shape, got {cur_log_prob.shape=} and {ref_log_prob.shape=}"
            )
        if mask is not None:
            ref_log_prob = torch.where(
                expand_as_right(mask, ref_log_prob), ref_log_prob, 0.0
            )
            cur_log_prob = torch.where(
                expand_as_right(mask, cur_log_prob), cur_log_prob, 0.0
            )
        diff = ref_log_prob - cur_log_prob
        kl_penalty = (diff.expm1() - diff).mean()
        return coeff * kl_penalty, kl_penalty

    def _log_weight(
        self, tensordict: TensorDictBase, adv_shape: torch.Size
    ) -> tuple[torch.Tensor, d.Distribution, torch.Tensor]:

        cur_log_prob, dist, is_composite = self._get_cur_log_prob(tensordict)

        prev_log_prob = tensordict.get(
            self.tensor_keys.sample_log_prob,
            as_padded_tensor=True,
            padding_side="left",
            padding_value=0.0,
        )

        if prev_log_prob is None:
            raise KeyError(
                f"Couldn't find the log-prob {self.tensor_keys.sample_log_prob} in the input data."
            )
        if prev_log_prob.requires_grad:
            raise RuntimeError(
                f"tensordict stored {self.tensor_keys.sample_log_prob} requires grad."
            )

        # Check for shape mismatches and provide helpful error messages
        if cur_log_prob.shape != prev_log_prob.shape:
            # Try to provide helpful debugging information
            error_msg = (
                f"Shape mismatch detected in GRPOLoss: current log-prob shape {cur_log_prob.shape} "
                f"!= previous log-prob shape {prev_log_prob.shape}. "
                f"This usually indicates a mismatch between the masking strategy used for "
                f"advantage computation and the masking strategy used for loss computation.\n"
                f"Current masking strategy: '{self.masking_strategy}'\n"
                f"Possible solutions:\n"
                f"1. If using RLHF (multi-turn conversations), set masking_strategy='rlhf'\n"
                f"2. If using SFT (single-turn conversations), set masking_strategy='sft'\n"
                f"3. If using generic scenarios, set masking_strategy='generic'\n"
                f"4. Ensure the advantage was computed with the same masking strategy as the loss"
            )
            raise ValueError(error_msg)

        attention_mask = dist.mask
        cur_log_prob = torch.where(
            expand_as_right(attention_mask, cur_log_prob), cur_log_prob, 0.0
        )
        prev_log_prob = torch.where(
            expand_as_right(attention_mask, prev_log_prob), prev_log_prob, 0.0
        )

        if is_composite:
            raise NotImplementedError
        log_weight = (cur_log_prob - prev_log_prob).unsqueeze(-1)
        if is_tensor_collection(log_weight):
            log_weight = _sum_td_features(log_weight)
            log_weight = log_weight.view(adv_shape).unsqueeze(-1)

        kl_approx = (prev_log_prob - cur_log_prob).unsqueeze(-1)
        if is_tensor_collection(kl_approx):
            kl_approx = _sum_td_features(kl_approx)

        tensordict.set("_cur_log_prob", cur_log_prob)

        return log_weight, dist, kl_approx


class DAPO(GRPOLoss):
    """DAPO (Clip-Higher over GRPO).

    Validates asymmetric clip thresholds; recommended (0.20, 0.28), see Eq. (10) in
    the `DAPO <https://arxiv.org/html/2503.14476>`_ paper.
    """

    output_type: type[LLMLossOutput] = DAPOLossOutput

    def __init__(
        self,
        actor_network: LLMWrapperBase | None = None,
        *,
        clip_epsilon: tuple[float, float] = (0.20, 0.28),
        **kwargs,
    ):
        # Clip-Higher: DAPO expects an asymmetric (eps_low, eps_high) pair; everything
        # else (masking, ESS, entropy, optional KL penalties) is inherited from GRPOLoss.
        if not isinstance(clip_epsilon, (tuple, list)) or len(clip_epsilon) != 2:
            raise ValueError(
                f"DAPO expects clip_epsilon to be a (eps_low, eps_high) pair, got {clip_epsilon}."
            )
        super().__init__(actor_network, clip_epsilon=tuple(clip_epsilon), **kwargs)


class CISPOLoss(GRPOLoss):
    """CISPO (Clipped Importance Sampling Policy Optimization).

    Inherits the GRPO pipeline (masking, ESS, entropy, optional KL penalties) but
    replaces the PPO-style min with a clipped-importance objective::

        loss = - clip(weight, [1 - eps_low, 1 + eps_high]) * advantage

    See the `MiniMax-M1 (CISPO) <https://arxiv.org/html/2506.13585>`_ paper.
    """

    output_type: type[LLMLossOutput] = CISPOLossOutput

    def _compute_policy_objective(
        self, log_weight: torch.Tensor, advantage: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # CISPO: use clipped importance weights directly
        log_weight_clip = log_weight.clamp(*self._clip_bounds)
        clip_fraction = (log_weight_clip != log_weight).to(log_weight.dtype).mean()
        ratio = log_weight_clip.exp()
        gain = ratio * advantage
        return -gain, clip_fraction


class MCAdvantage(Transform):
    """Monte-Carlo advantage computation engine.

    When writing on a replay buffer, this transform keeps track of the existing trajectories with a similar
    initial prompt and holds a queue for that particular prompt in memory.
    When that queue hits a certain length, the advantage is computed by normalizing the rewards across all the
    steps of all the trajectories.

    This transform assumes that :meth:`~torchrl.data.ReplayBuffer.add` and :meth:`~torchrl.data.ReplayBuffer.extend`
    are executed with completed trajectories (i.e., trajectories that end up with a done state). If this is not the
    case, an exception is raised.

    .. warning:: This transform will flatten the input tensordicts and therefore is not compatible yet with replay
        buffers hosting storages of more than one dimension.

    Args:
        grpo_size (int): Number of trajectories to keep in memory for the advantage computation.
        prompt_key (NestedKey): Key to the prompt in the tensordict. Defaults to "query".
        rewards_key (NestedKey): Key to the rewards in the tensordict. Defaults to ("next", "reward").
        advantage_key (NestedKey): Key to the advantage in the tensordict. Defaults to "advantage".
        done_key (NestedKey): Key to the done state in the tensordict. Defaults to ("next", "done").
        verbose (bool): Whether to print verbose information. Defaults to `False`.

    """

    def __init__(
        self,
        grpo_size: int,
        prompt_key: NestedKey = "query",
        rewards_key: NestedKey = ("next", "reward"),
        advantage_key: NestedKey = "advantage",
        done_key: NestedKey = ("next", "done"),
        verbose: bool = False,
    ):
        super().__init__()
        self.in_keys = [prompt_key, rewards_key, done_key]
        self.out_keys = [advantage_key]
        self.prompt_key = prompt_key
        self.rewards_key = rewards_key
        self.advantage_key = advantage_key
        self.done_key = done_key
        self.queues = defaultdict(lambda: deque(maxlen=grpo_size))
        self.grpo_size = grpo_size
        self.verbose = verbose

    def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
        return tensordict

    def _inv_call(self, tensordict: TensorDictBase) -> TensorDictBase:
        if self.verbose:
            torchrl_logger.info(
                f"Invoking MCAdvantage.\nData size: {tensordict.shape}.\nCurrent queue size: {len(self.queues)}.\nTotal queue content: {sum(len(q) for q in self.queues.values())}"
            )
        # Tensordict can be any number of dims, but it must contain entire trajectories
        if tensordict.ndim == 1:
            # Check how many done states we have
            num_done = tensordict[self.done_key].sum()
            if num_done > 1:
                done_idx = tensordict[self.done_key].nonzero(as_tuple=True)[0] + 1
                splits = torch.cat([done_idx.new_zeros((1,)), done_idx], dim=0).diff()
                tensordicts = tensordict.split(splits)
                tensordicts = [self._inv_call(td) for td in tensordicts]
                tensordicts = [td for td in tensordicts if td is not None]
                return torch.cat(tensordicts) if tensordicts else None
            # Then we have a single trajectory
            if not tensordict[-1][self.done_key].all():
                raise RuntimeError("Expected the trajectory to be done.")
            prompt = tensordict[0][self.prompt_key]
            if not isinstance(prompt, str):
                raise TypeError(f"Expected a string as prompt, got {type(prompt)=}")
            self.queues[prompt].append(tensordict)
            if len(self.queues[prompt]) == self.grpo_size:
                if self.verbose:
                    torchrl_logger.info(f"Computing advantage for {prompt=}")
                # Cat is the most robust way to combine the trajs
                tds = torch.cat(list(self.queues[prompt]), -1)
                del self.queues[prompt]
                # Collect rewards
                reward = tds.get(self.rewards_key, as_nested_tensor=True)
                reward_mean = reward.values().mean()
                reward_scale = reward.values().std()
                advantage = (reward - reward_mean) / reward_scale.clamp_min(1e-6)
                if self.verbose:
                    torchrl_logger.info(f"Advantage: {reward_mean=} {reward_scale=}")
                tds.set(self.advantage_key, advantage)
                return tds
            return
        elif tensordict.ndim > 2:
            # keep the time dim at the end
            tensordict = tensordict.flatten(0, -2)
        trajs = tensordict.unbind(0)
        # Iterate over the trajectories
        result = []
        for traj in trajs:
            td_out = self._inv_call(traj)
            if td_out is None:
                continue
            result.append(td_out)
        if result:
            return torch.cat(result, 0)
        return
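
As a worked illustration of the clipped objective and the ESS diagnostic documented in the GRPOLoss docstring above, the following self-contained sketch reproduces the same arithmetic with plain tensors; the numeric values and variable names are invented for the example and are not part of the packaged module.

    import torch

    # Hypothetical per-token log importance weights log(pi_theta / pi_behavior) and advantages.
    log_weight = torch.tensor([0.05, -0.30, 0.40, 0.00])
    advantage = torch.tensor([1.0, -0.5, 2.0, 0.3])
    eps_low, eps_high = 0.20, 0.28  # asymmetric thresholds, as recommended for DAPO Clip-Higher

    # Clipped surrogate: loss = -min(w * A, clamp(w, 1 - eps_low, 1 + eps_high) * A)
    weight = log_weight.exp()
    clipped = weight.clamp(1 - eps_low, 1 + eps_high)
    loss = -torch.min(weight * advantage, clipped * advantage)

    # Effective sample size, computed in log-space as in GRPOLoss.forward:
    # ESS = (sum_i w_i)^2 / sum_i w_i^2, then normalized by the token count.
    ess = (2 * log_weight.logsumexp(0) - (2 * log_weight).logsumexp(0)).exp()
    print(loss.mean().item(), (ess / log_weight.numel()).item())  # ~1.0 when all weights are equal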
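
The CISPOLoss subclass above only changes how the clipped ratio enters the objective. A minimal side-by-side sketch of the two _compute_policy_objective variants, again with made-up tensors rather than packaged code:

    import torch

    log_weight = torch.tensor([0.5, -0.4, 0.1])  # log(pi_theta / pi_behavior)
    advantage = torch.tensor([1.0, 1.0, -1.0])
    # Clip bounds in log-space: (log(1 - eps_low), log(1 + eps_high)) with eps = (0.2, 0.28).
    lo, hi = torch.log(torch.tensor(0.8)), torch.log(torch.tensor(1.28))

    clipped = log_weight.clamp(lo, hi)
    clip_fraction = (clipped != log_weight).float().mean()
    grpo_loss = -torch.min(log_weight.exp() * advantage, clipped.exp() * advantage)  # PPO-style min
    cispo_loss = -clipped.exp() * advantage  # clipped importance weight used directly
    print(grpo_loss, cispo_loss, clip_fraction)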
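
MCAdvantage normalizes rewards across a group of completed trajectories that share the same prompt. Stripped of the queueing and replay-buffer plumbing shown above, the advantage it writes reduces to the following sketch (reward values invented for the example):

    import torch

    # Terminal rewards of grpo_size completed trajectories collected for one prompt.
    rewards = torch.tensor([1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0])
    advantage = (rewards - rewards.mean()) / rewards.std().clamp_min(1e-6)
    print(advantage)  # positive for above-average completions, negative for the rest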