liger-kernel 0.6.1__tar.gz → 0.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301)
  1. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/workflows/amd-ci.yml +2 -4
  2. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/workflows/benchmark.yml +10 -9
  3. liger_kernel-0.6.3/.github/workflows/docs.yml +64 -0
  4. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/workflows/intel-ci.yml +0 -3
  5. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/workflows/nvi-ci.yml +0 -3
  6. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.gitignore +2 -1
  7. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/Makefile +9 -1
  8. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/PKG-INFO +14 -11
  9. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/README.md +10 -6
  10. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/data/all_benchmark_data.csv +129 -1
  11. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_cross_entropy.py +4 -1
  12. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +30 -16
  13. liger_kernel-0.6.3/benchmark/scripts/benchmark_grpo_loss.py +234 -0
  14. liger_kernel-0.6.3/benchmark/scripts/benchmark_llama4_rope.py +249 -0
  15. liger_kernel-0.6.3/benchmark/scripts/benchmark_poly_norm.py +197 -0
  16. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/dev/modal/tests.py +1 -1
  17. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/dev/modal/tests_bwd.py +1 -1
  18. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/Examples.md +1 -1
  19. liger_kernel-0.6.3/docs/High-Level-APIs.md +93 -0
  20. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/acknowledgement.md +0 -1
  21. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/alignment/run_orpo.py +1 -1
  22. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/training.py +2 -2
  23. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/training_multimodal.py +1 -1
  24. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/train.py +1 -1
  25. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/mkdocs.yml +27 -9
  26. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/pyproject.toml +29 -1
  27. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/setup.py +14 -5
  28. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/dpo_loss.py +54 -3
  29. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +4 -0
  30. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/grpo_loss.py +38 -4
  31. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/jsd_loss.py +5 -2
  32. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/cross_entropy.py +59 -53
  33. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/fused_linear_cross_entropy.py +83 -17
  34. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/layer_norm.py +4 -6
  35. liger_kernel-0.6.3/src/liger_kernel/ops/llama4_rope.py +225 -0
  36. liger_kernel-0.6.3/src/liger_kernel/ops/poly_norm.py +386 -0
  37. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/__init__.py +32 -0
  38. liger_kernel-0.6.3/src/liger_kernel/transformers/experimental/__init__.py +5 -0
  39. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/functional.py +9 -0
  40. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +8 -1
  41. liger_kernel-0.6.3/src/liger_kernel/transformers/llama4_rope.py +93 -0
  42. liger_kernel-0.6.3/src/liger_kernel/transformers/model/falcon_h1.py +108 -0
  43. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/gemma.py +2 -1
  44. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/gemma2.py +8 -2
  45. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/gemma3.py +27 -2
  46. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/glm4.py +2 -1
  47. liger_kernel-0.6.3/src/liger_kernel/transformers/model/glm4v.py +151 -0
  48. liger_kernel-0.6.3/src/liger_kernel/transformers/model/glm4v_moe.py +153 -0
  49. liger_kernel-0.6.3/src/liger_kernel/transformers/model/internvl.py +150 -0
  50. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/llama.py +2 -1
  51. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/llama4.py +2 -1
  52. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/llava.py +6 -2
  53. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/loss_utils.py +3 -0
  54. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/mistral.py +2 -1
  55. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/mixtral.py +8 -2
  56. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/mllama.py +6 -3
  57. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/olmo2.py +2 -1
  58. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/paligemma.py +19 -0
  59. liger_kernel-0.6.3/src/liger_kernel/transformers/model/phi3.py +113 -0
  60. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/qwen2.py +2 -1
  61. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/qwen2_5_vl.py +7 -2
  62. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/qwen2_vl.py +7 -2
  63. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/qwen3.py +2 -1
  64. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/qwen3_moe.py +8 -2
  65. liger_kernel-0.6.3/src/liger_kernel/transformers/model/qwen3_next.py +134 -0
  66. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/smollm3.py +2 -1
  67. liger_kernel-0.6.3/src/liger_kernel/transformers/model/smolvlm.py +158 -0
  68. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/monkey_patch.py +552 -23
  69. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/multi_token_attention.py +1 -1
  70. liger_kernel-0.6.3/src/liger_kernel/transformers/poly_norm.py +42 -0
  71. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/rms_norm.py +7 -0
  72. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel.egg-info/PKG-INFO +14 -11
  73. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel.egg-info/SOURCES.txt +17 -0
  74. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel.egg-info/requires.txt +3 -4
  75. liger_kernel-0.6.3/test/chunked_loss/test_dpo_loss.py +938 -0
  76. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/chunked_loss/test_grpo_loss.py +35 -4
  77. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/chunked_loss/test_jsd_loss.py +5 -2
  78. liger_kernel-0.6.3/test/conftest.py +11 -0
  79. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/bf16/test_mini_models.py +401 -2
  80. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/bf16/test_mini_models_multimodal.py +227 -5
  81. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/bf16/test_mini_models_with_logits.py +401 -1
  82. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/fp32/test_mini_models.py +391 -1
  83. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/fp32/test_mini_models_multimodal.py +217 -7
  84. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/fp32/test_mini_models_with_logits.py +389 -0
  85. liger_kernel-0.6.3/test/resources/fake_configs/HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json +1192 -0
  86. liger_kernel-0.6.3/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json +307 -0
  87. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_cross_entropy.py +45 -0
  88. liger_kernel-0.6.3/test/transformers/test_fused_linear_cross_entropy.py +731 -0
  89. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_monkey_patch.py +522 -29
  90. liger_kernel-0.6.3/test/transformers/test_poly_norm.py +281 -0
  91. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/utils.py +109 -7
  92. liger_kernel-0.6.1/.github/workflows/docs.yml +0 -30
  93. liger_kernel-0.6.1/docs/High-Level-APIs.md +0 -30
  94. liger_kernel-0.6.1/src/liger_kernel/transformers/model/phi3.py +0 -263
  95. liger_kernel-0.6.1/test/chunked_loss/test_dpo_loss.py +0 -358
  96. liger_kernel-0.6.1/test/conftest.py +0 -8
  97. liger_kernel-0.6.1/test/transformers/test_fused_linear_cross_entropy.py +0 -347
  98. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  99. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  100. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/pull_request_template.md +0 -0
  101. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/workflows/publish-nightly.yml +0 -0
  102. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/.github/workflows/publish-release.yml +0 -0
  103. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/LICENSE +0 -0
  104. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/NOTICE +0 -0
  105. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/README.md +0 -0
  106. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/__init__.py +0 -0
  107. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/benchmarks_visualizer.py +0 -0
  108. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/__init__.py +0 -0
  109. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  110. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  111. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  112. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  113. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_dyt.py +0 -0
  114. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_embedding.py +0 -0
  115. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
  116. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  117. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  118. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_geglu.py +0 -0
  119. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_group_norm.py +0 -0
  120. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_jsd.py +0 -0
  121. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_kl_div.py +0 -0
  122. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  123. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  124. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  125. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  126. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  127. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  128. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_rope.py +0 -0
  129. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  130. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_softmax.py +0 -0
  131. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  132. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  133. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_swiglu.py +0 -0
  134. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_tvd.py +0 -0
  135. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/benchmark/scripts/utils.py +0 -0
  136. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/dev/fmt-requirements.txt +0 -0
  137. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/dev/modal/benchmarks.py +0 -0
  138. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/Getting-Started.md +0 -0
  139. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/Low-Level-APIs.md +0 -0
  140. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/contributing.md +0 -0
  141. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/images/banner.GIF +0 -0
  142. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/images/compose.gif +0 -0
  143. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/images/e2e-memory.png +0 -0
  144. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/images/e2e-tps.png +0 -0
  145. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/images/logo-banner.png +0 -0
  146. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/images/patch.gif +0 -0
  147. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/images/post-training.png +0 -0
  148. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/index.md +0 -0
  149. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/docs/license.md +0 -0
  150. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/alignment/accelerate_config.yaml +0 -0
  151. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/README.md +0 -0
  152. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/callback.py +0 -0
  153. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/config/fsdp_config.json +0 -0
  154. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  155. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  156. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  157. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/img/llama_tps.png +0 -0
  158. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  159. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/img/qwen_tps.png +0 -0
  160. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/launch_on_modal.py +0 -0
  161. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/requirements.txt +0 -0
  162. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/run_benchmarks.sh +0 -0
  163. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/run_gemma.sh +0 -0
  164. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/run_llama.sh +0 -0
  165. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/run_qwen.sh +0 -0
  166. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/huggingface/run_qwen2_vl.sh +0 -0
  167. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/lightning/README.md +0 -0
  168. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/lightning/requirements.txt +0 -0
  169. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/lightning/training.py +0 -0
  170. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/README.md +0 -0
  171. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/callback.py +0 -0
  172. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  173. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  174. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  175. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  176. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  177. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  178. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  179. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  180. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  181. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/medusa_util.py +0 -0
  182. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/requirements.txt +0 -0
  183. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  184. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/licenses/LICENSE-Apache-2.0 +0 -0
  185. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  186. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  187. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/licenses/LICENSE-MIT-llmc +0 -0
  188. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/licenses/LICENSE-MIT-triton +0 -0
  189. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/setup.cfg +0 -0
  190. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/__init__.py +0 -0
  191. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/README.md +0 -0
  192. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  193. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
  194. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  195. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/functional.py +0 -0
  196. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  197. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  198. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  199. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  200. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  201. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  202. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/env_report.py +0 -0
  203. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/__init__.py +0 -0
  204. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/dyt.py +0 -0
  205. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  206. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  207. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
  208. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  209. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  210. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/geglu.py +0 -0
  211. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/group_norm.py +0 -0
  212. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/grpo_loss.py +0 -0
  213. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/jsd.py +0 -0
  214. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/kl_div.py +0 -0
  215. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  216. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  217. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/rms_norm.py +0 -0
  218. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/rope.py +0 -0
  219. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/softmax.py +0 -0
  220. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/sparsemax.py +0 -0
  221. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/swiglu.py +0 -0
  222. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/tvd.py +0 -0
  223. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/ops/utils.py +0 -0
  224. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/auto_model.py +0 -0
  225. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  226. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/dyt.py +0 -0
  227. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  228. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/fsdp.py +0 -0
  229. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
  230. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  231. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  232. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/geglu.py +0 -0
  233. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/group_norm.py +0 -0
  234. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  235. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/jsd.py +0 -0
  236. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/kl_div.py +0 -0
  237. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/layer_norm.py +0 -0
  238. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/model/__init__.py +0 -0
  239. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  240. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/rope.py +0 -0
  241. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/softmax.py +0 -0
  242. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/sparsemax.py +0 -0
  243. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/swiglu.py +0 -0
  244. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  245. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  246. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  247. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/transformers/tvd.py +0 -0
  248. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/triton/__init__.py +0 -0
  249. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/triton/monkey_patch.py +0 -0
  250. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel/utils.py +0 -0
  251. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel.egg-info/dependency_links.txt +0 -0
  252. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/src/liger_kernel.egg-info/top_level.txt +0 -0
  253. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/__init__.py +0 -0
  254. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/chunked_loss/__init__.py +0 -0
  255. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/chunked_loss/test_cosine_loss.py +0 -0
  256. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/chunked_loss/test_cpo_loss.py +0 -0
  257. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/chunked_loss/test_kto_loss.py +0 -0
  258. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/chunked_loss/test_orpo_loss.py +0 -0
  259. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/chunked_loss/test_simpo_loss.py +0 -0
  260. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/__init__.py +0 -0
  261. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/bf16/__init__.py +0 -0
  262. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/convergence/fp32/__init__.py +0 -0
  263. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  264. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  265. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  266. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  267. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  268. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  269. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  270. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  271. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  272. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  273. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/tiny_shakespeare.txt +0 -0
  274. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  275. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  276. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  277. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_auto_model.py +0 -0
  278. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_dyt.py +0 -0
  279. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_embedding.py +0 -0
  280. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_flex_attention.py +0 -0
  281. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_fused_add_rms_norm.py +0 -0
  282. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_fused_linear_jsd.py +0 -0
  283. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  284. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_geglu.py +0 -0
  285. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_group_norm.py +0 -0
  286. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_grpo_loss.py +0 -0
  287. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_jsd.py +0 -0
  288. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_kl_div.py +0 -0
  289. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_layer_norm.py +0 -0
  290. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_mm_int8int2.py +0 -0
  291. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_multi_token_attention.py +0 -0
  292. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_qwen2vl_mrope.py +0 -0
  293. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_rms_norm.py +0 -0
  294. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_rope.py +0 -0
  295. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_softmax.py +0 -0
  296. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_sparsemax.py +0 -0
  297. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_swiglu.py +0 -0
  298. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_trainer_integration.py +0 -0
  299. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_transformers.py +0 -0
  300. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/transformers/test_tvd.py +0 -0
  301. {liger_kernel-0.6.1 → liger_kernel-0.6.3}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -13,9 +13,6 @@ on:
13
13
  paths:
14
14
  - "src/**"
15
15
  - "test/**"
16
- schedule:
17
- # Runs at 00:00 UTC daily
18
- - cron: '0 0 * * *'
19
16
  workflow_dispatch: # Enables manual trigger
20
17
 
21
18
  concurrency:
@@ -64,7 +61,8 @@ jobs:
64
61
  run: |
65
62
  rocm-smi
66
63
  python -m pip install --upgrade pip
67
- pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm_version }}
64
+ pip install -e .[dev]
65
+ pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm_version }}/
68
66
 
69
67
  - name: List Python Environments
70
68
  run: python -m pip list
@@ -39,24 +39,24 @@ jobs:
39
39
 
40
40
  steps:
41
41
  # Step: Decide the commit hash to use
42
+ # Step: Checkout full history so we can check out any commit
43
+ - name: Checkout full repo history
44
+ uses: actions/checkout@v3
45
+ with:
46
+ fetch-depth: 0 # Important: so we can checkout arbitrary commit
47
+
42
48
  - name: Determine commit hash to checkout
43
49
  id: choose_commit
44
50
  run: |
45
- if [ "${{ github.event.inputs.commit_hash }}" != "" ]; then
51
+ if [ "${{ github.event_name}}" == "workflow_dispatch" ] && [ "${{ github.event.inputs.commit_hash }}" != "main" ]; then
46
52
  echo "Using manual input commit: ${{ github.event.inputs.commit_hash }}"
47
53
  echo "hash=${{ github.event.inputs.commit_hash }}" >> $GITHUB_OUTPUT
48
54
  else
49
55
  echo "Using latest commit from main"
50
- git fetch origin main
51
- echo "hash=$(git rev-parse origin/main)" >> $GITHUB_OUTPUT
56
+ echo "hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
52
57
  fi
53
58
 
54
- # Step: Checkout full history so we can check out any commit
55
- - name: Checkout full repo history
56
- uses: actions/checkout@v3
57
- with:
58
- fetch-depth: 0 # Important: so we can checkout arbitrary commit
59
- # Step: Conditionally replace benchmark folder from main
59
+ # Step: Conditionally replace benchmark folder from main
60
60
  - name: Replace benchmark folder from main (manual only, commit ≠ main)
61
61
  if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.commit_hash != 'main' }}
62
62
  run: |
@@ -133,6 +133,7 @@ jobs:
133
133
  echo "Not a release event"
134
134
  path=${{steps.choose_commit.outputs.hash}}
135
135
  fi
136
+ echo "path=$path" >> $GITHUB_OUTPUT
136
137
  COMMIT_DIR="gh-pages/${OUTPUT_DIR}/${path}"
137
138
 
138
139
  mkdir -p "$COMMIT_DIR"
@@ -0,0 +1,64 @@
1
+ name: Publish documentation
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ paths:
7
+ - 'docs/**'
8
+ - 'mkdocs.yml'
9
+
10
+ permissions:
11
+ contents: write
12
+ jobs:
13
+ deploy:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - name: Configure Git Credentials
18
+ run: |
19
+ git config user.name github-actions[bot]
20
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
21
+ - uses: actions/setup-python@v5
22
+ with:
23
+ python-version: 3.x
24
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
25
+ - uses: actions/cache@v4
26
+ with:
27
+ key: mkdocs-material-${{ env.cache_id }}
28
+ path: .cache
29
+ restore-keys: |
30
+ mkdocs-material-
31
+ - run: pip install mkdocs-material mkdocstrings[python]
32
+ # ====== Backup the benchmarks from gh-pages ======
33
+ # This is necessary because the benchmarks are not included in the documentation build process.
34
+ # So we need to backup the benchmarks from gh-pages and restore them after the documentation is built.
35
+ - name: Backup benchmarks from gh-pages
36
+ run: |
37
+ git fetch origin gh-pages
38
+ # create worktree bound to local gh-pages, tracking origin/gh-pages
39
+ git branch -f gh-pages origin/gh-pages || true
40
+ mkdir -p ghp && git worktree add ghp gh-pages || true
41
+ if [ -d ghp/benchmarks ]; then
42
+ tar -C ghp -czf /tmp/benchmarks.tgz benchmarks
43
+ fi
44
+ # IMPORTANT: remove worktree so gh-pages isn't checked out anywhere
45
+ git worktree remove ghp --force || true
46
+ echo "Backed up benchmarks from gh-pages"
47
+ # ====== Deploy the documentation ======
48
+ - name: Deploy documentation
49
+ run: mkdocs gh-deploy --force
50
+ # ====== Restore the benchmarks onto gh-pages ======
51
+ # This is necessary because the benchmarks are not included in the documentation build process.
52
+ # So we need to restore the benchmarks onto gh-pages after the documentation is built.
53
+ - name: Restore benchmarks onto gh-pages
54
+ run: |
55
+ # Refresh remote tracking and recreate a clean worktree
56
+ git fetch origin gh-pages
57
+ git worktree add -B gh-pages ghp origin/gh-pages
58
+ if [ -f /tmp/benchmarks.tgz ]; then
59
+ tar -C ghp -xzf /tmp/benchmarks.tgz
60
+ git -C ghp add -A
61
+ git -C ghp commit -m "Restore benchmarks after gh-deploy" || echo "No changes"
62
+ git -C ghp push origin gh-pages
63
+ fi
64
+ git worktree remove ghp --force || true
@@ -13,9 +13,6 @@ on:
13
13
  paths:
14
14
  - "src/**"
15
15
  - "test/**"
16
- schedule:
17
- # Runs at 00:00 UTC daily
18
- - cron: '0 0 * * *'
19
16
  workflow_dispatch: # Enables manual trigger
20
17
 
21
18
  concurrency:
@@ -13,9 +13,6 @@ on:
13
13
  paths:
14
14
  - "src/**"
15
15
  - "test/**"
16
- schedule:
17
- # Runs at 00:00 UTC daily
18
- - cron: '0 0 * * *'
19
16
  workflow_dispatch: # Enables manual trigger
20
17
 
21
18
  concurrency:
@@ -23,4 +23,5 @@ uv.lock
23
23
 
24
24
  # Benchmark images
25
25
  benchmark/visualizations
26
- .vscode/
26
+ .vscode/
27
+ .coverage
@@ -5,7 +5,15 @@ all: checkstyle test test-convergence
5
5
 
6
6
  # Command to run pytest for correctness tests
7
7
  test:
8
- python -m pytest --disable-warnings test/ --ignore=test/convergence
8
+ python -m pytest --disable-warnings \
9
+ --cov=src/liger_kernel \
10
+ --cov-report=term-missing \
11
+ --ignore=test/convergence \
12
+ test/
13
+
14
+ # Command to run coverage report
15
+ coverage:
16
+ coverage report -m
9
17
 
10
18
  # Command to run ruff for linting and formatting code
11
19
  checkstyle:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: liger_kernel
3
- Version: 0.6.1
3
+ Version: 0.6.3
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -35,15 +35,14 @@ Requires-Dist: triton>=2.3.1
35
35
  Provides-Extra: dev
36
36
  Requires-Dist: transformers>=4.49.0; extra == "dev"
37
37
  Requires-Dist: matplotlib>=3.7.2; extra == "dev"
38
- Requires-Dist: flake8>=4.0.1.1; extra == "dev"
39
- Requires-Dist: black>=24.4.2; extra == "dev"
40
- Requires-Dist: isort>=5.13.2; extra == "dev"
38
+ Requires-Dist: ruff>=0.12.0; extra == "dev"
41
39
  Requires-Dist: pytest>=7.1.2; extra == "dev"
42
40
  Requires-Dist: pytest-xdist; extra == "dev"
41
+ Requires-Dist: pytest-cov; extra == "dev"
42
+ Requires-Dist: pytest-asyncio; extra == "dev"
43
43
  Requires-Dist: pytest-rerunfailures; extra == "dev"
44
44
  Requires-Dist: datasets>=2.19.2; extra == "dev"
45
45
  Requires-Dist: seaborn; extra == "dev"
46
- Requires-Dist: mkdocs; extra == "dev"
47
46
  Requires-Dist: mkdocs-material; extra == "dev"
48
47
  Requires-Dist: torchvision>=0.20; extra == "dev"
49
48
  Dynamic: license-file
@@ -181,8 +180,8 @@ y = orpo_loss(lm_head.weight, x, target)
181
180
  - `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)
182
181
 
183
182
  ```bash
184
- # Need to pass the url when installing
185
- pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
183
+ pip install -e .[dev]
184
+ pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
186
185
  ```
187
186
 
188
187
  ### Optional Dependencies
@@ -216,6 +215,9 @@ pip install -e .
216
215
 
217
216
  # Setup Development Dependencies
218
217
  pip install -e ".[dev]"
218
+
219
+ # NOTE -> For AMD users only
220
+ pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
219
221
  ```
220
222
 
221
223
 
@@ -312,6 +314,7 @@ loss.backward()
312
314
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
313
315
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
314
316
  | GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
317
+ | InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
315
318
 
316
319
 
317
320
  ## Low-level APIs
@@ -391,17 +394,17 @@ loss.backward()
391
394
  <td style="padding: 10px;">
392
395
  <div style="display: block;">
393
396
  <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml">
394
- <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml/badge.svg?event=schedule" alt="Build">
397
+ <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml/badge.svg?branch=main&event=push" alt="Build">
395
398
  </a>
396
399
  </div>
397
400
  <div style="display: block;">
398
401
  <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
399
- <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?event=schedule" alt="Build">
402
+ <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?branch=main&event=push" alt="Build">
400
403
  </a>
401
404
  </div>
402
405
  <div style="display: block;">
403
- <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
404
- <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?event=schedule" alt="Build">
406
+ <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml">
407
+ <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?branch=main&event=push" alt="Build">
405
408
  </a>
406
409
  </div>
407
410
  </td>
@@ -129,8 +129,8 @@ y = orpo_loss(lm_head.weight, x, target)
129
129
  - `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)
130
130
 
131
131
  ```bash
132
- # Need to pass the url when installing
133
- pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
132
+ pip install -e .[dev]
133
+ pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
134
134
  ```
135
135
 
136
136
  ### Optional Dependencies
@@ -164,6 +164,9 @@ pip install -e .
164
164
 
165
165
  # Setup Development Dependencies
166
166
  pip install -e ".[dev]"
167
+
168
+ # NOTE -> For AMD users only
169
+ pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
167
170
  ```
168
171
 
169
172
 
@@ -260,6 +263,7 @@ loss.backward()
260
263
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
261
264
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
262
265
  | GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
266
+ | InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
263
267
 
264
268
 
265
269
  ## Low-level APIs
@@ -339,17 +343,17 @@ loss.backward()
339
343
  <td style="padding: 10px;">
340
344
  <div style="display: block;">
341
345
  <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml">
342
- <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml/badge.svg?event=schedule" alt="Build">
346
+ <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml/badge.svg?branch=main&event=push" alt="Build">
343
347
  </a>
344
348
  </div>
345
349
  <div style="display: block;">
346
350
  <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
347
- <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?event=schedule" alt="Build">
351
+ <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?branch=main&event=push" alt="Build">
348
352
  </a>
349
353
  </div>
350
354
  <div style="display: block;">
351
- <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
352
- <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?event=schedule" alt="Build">
355
+ <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml">
356
+ <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?branch=main&event=push" alt="Build">
353
357
  </a>
354
358
  </div>
355
359
  </td>
@@ -1574,4 +1574,132 @@ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,2048,208.06298828
1574
1574
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578125,416.11767578125,416.11767578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1575
1575
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1576
1576
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1577
- fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1577
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1578
+ fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,2,40.75366401672363,40.749671173095706,40.75765686035156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
1579
+ fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,4,80.95231628417969,80.95231628417969,80.95231628417969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
1580
+ fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,8,163.58604431152344,163.58604431152344,163.58604431152344,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
1581
+ fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,16,323.6761474609375,323.6761474609375,323.6761474609375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
1582
+ fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,2,23.71225643157959,23.612825775146483,23.8354434967041,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
1583
+ fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,4,46.86131286621094,46.80355911254883,46.91906661987304,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
1584
+ fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,8,94.54898834228516,94.54898834228516,94.54898834228516,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
1585
+ fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,16,189.99501037597656,189.99501037597656,189.99501037597656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
1586
+ fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,2,42.67263984680176,42.54085083007813,42.80442886352539,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
1587
+ fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,4,82.2446060180664,82.2446060180664,82.2446060180664,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
1588
+ fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,8,167.00416564941406,167.00416564941406,167.00416564941406,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
1589
+ fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,16,327.0911865234375,327.0911865234375,327.0911865234375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
1590
+ fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,2,45.36115264892578,45.241344451904304,45.480960845947266,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
1591
+ fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,4,90.00038146972656,90.00038146972656,90.00038146972656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
1592
+ fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,8,177.22674560546875,177.22674560546875,177.22674560546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
1593
+ fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,16,356.5383605957031,356.5383605957031,356.5383605957031,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
1594
+ fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,2,1.814527988433838,1.8124799728393555,1.8167808055877686,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
1595
+ fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,4,1.84934401512146,1.8472959995269775,1.8524160385131836,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
1596
+ fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8872319459915161,1.893990397453308,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
1597
+ fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,16,1.9722239971160889,1.9660799503326416,1.9763200283050537,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
1598
+ fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,2,22.014975547790527,21.710438537597657,22.19417533874512,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
1599
+ fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,4,41.83603096008301,41.752165222167974,41.91989669799805,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
1600
+ fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,8,81.66400146484375,81.66400146484375,81.66400146484375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
1601
+ fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,16,162.6429443359375,162.6429443359375,162.6429443359375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
1602
+ fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
1603
+ fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
1604
+ fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
1605
+ fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
1606
+ fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
1607
+ fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
1608
+ fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
1609
+ fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
1610
+ fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,2,40.72038269042969,40.71178131103516,40.728984069824214,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
1611
+ fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,4,81.69369506835938,81.69369506835938,81.69369506835938,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
1612
+ fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,8,162.79653930664062,162.79653930664062,162.79653930664062,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
1613
+ fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,16,323.6546630859375,323.6546630859375,323.6546630859375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
1614
+ fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,2,23.70047950744629,23.628594589233398,23.732429122924806,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
1615
+ fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,4,47.36921691894531,47.085364532470706,47.65306930541992,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
1616
+ fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,8,94.83366394042969,94.83366394042969,94.83366394042969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
1617
+ fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,16,190.0963897705078,190.0963897705078,190.0963897705078,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
1618
+ fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,2,42.318336486816406,42.15214080810547,42.48453216552734,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
1619
+ fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,4,82.4616928100586,82.4616928100586,82.4616928100586,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
1620
+ fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,8,163.43756103515625,163.43756103515625,163.43756103515625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
1621
+ fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,16,325.4384765625,325.4384765625,325.4384765625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
1622
+ fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,2,45.99193572998047,45.80761489868165,46.176256561279295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
1623
+ fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,4,88.57190704345703,88.57190704345703,88.57190704345703,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
1624
+ fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,8,176.94105529785156,176.94105529785156,176.94105529785156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
1625
+ fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,16,356.0478820800781,356.0478820800781,356.0478820800781,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
1626
+ fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,2,1.8242560029029846,1.8102271556854248,1.8309119939804077,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
1627
+ fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,4,1.84934401512146,1.846886396408081,1.8534400463104248,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
1628
+ fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8892799615859985,1.8933759927749634,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
1629
+ fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,16,1.9752960205078125,1.9722239971160889,1.977344036102295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
1630
+ fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,2,22.0262393951416,21.80997085571289,22.20482559204102,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
1631
+ fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,4,41.54521560668945,41.224806213378905,41.865625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
1632
+ fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,8,81.21753692626953,81.21753692626953,81.21753692626953,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
1633
+ fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,16,160.82022094726562,160.82022094726562,160.82022094726562,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
1634
+ fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
1635
+ fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
1636
+ fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
1637
+ fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
1638
+ fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
1639
+ fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
1640
+ fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
1641
+ fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
1642
+ llama4_rope,liger,forward,speed,ms,H,hidden size,512,0.08249600231647491,0.08102399855852127,0.08432000130414963,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
1643
+ llama4_rope,liger,forward,speed,ms,H,hidden size,2048,0.08169600367546082,0.08037760108709335,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
1644
+ llama4_rope,liger,forward,speed,ms,H,hidden size,8192,0.08128000050783157,0.07980799674987793,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
1645
+ llama4_rope,huggingface,forward,speed,ms,H,hidden size,512,0.03759999945759773,0.03612799942493439,0.03907199949026108,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
1646
+ llama4_rope,huggingface,forward,speed,ms,H,hidden size,2048,0.06185600161552429,0.061267200857400894,0.06252799928188324,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
1647
+ llama4_rope,huggingface,forward,speed,ms,H,hidden size,8192,0.206496000289917,0.20582400262355804,0.20716799795627594,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
1648
+ llama4_rope,liger,backward,speed,ms,H,hidden size,512,0.15404799580574036,0.15241600573062897,0.15615999698638916,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
1649
+ llama4_rope,liger,backward,speed,ms,H,hidden size,2048,0.1536320000886917,0.15190400183200836,0.1558080017566681,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
1650
+ llama4_rope,liger,backward,speed,ms,H,hidden size,8192,0.15263999998569489,0.15094399452209473,0.15491199493408203,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
1651
+ llama4_rope,huggingface,backward,speed,ms,H,hidden size,512,0.13760000467300415,0.13574400544166565,0.14009599387645721,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
1652
+ llama4_rope,huggingface,backward,speed,ms,H,hidden size,2048,0.13600000739097595,0.13449600338935852,0.1382720023393631,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
1653
+ llama4_rope,huggingface,backward,speed,ms,H,hidden size,8192,0.21011200547218323,0.20924800634384155,0.21110400557518005,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
1654
+ llama4_rope,liger,full,speed,ms,H,hidden size,512,0.3652159869670868,0.3619840145111084,0.3699840009212494,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
1655
+ llama4_rope,liger,full,speed,ms,H,hidden size,2048,0.3599040061235428,0.2881920039653778,0.36559998989105225,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
1656
+ llama4_rope,liger,full,speed,ms,H,hidden size,8192,0.2874239981174469,0.2852480113506317,0.29029120206832887,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
1657
+ llama4_rope,huggingface,full,speed,ms,H,hidden size,512,0.24691200256347656,0.24489599466323853,0.24961919784545897,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1658
+ llama4_rope,huggingface,full,speed,ms,H,hidden size,2048,0.24774399399757385,0.24582399427890778,0.2505407989025116,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1659
+ llama4_rope,huggingface,full,speed,ms,H,hidden size,8192,0.41414400935173035,0.41337600350379944,0.41491198539733887,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1660
+ llama4_rope,liger,full,memory,MB,H,hidden size,512,37.23486328125,37.23486328125,37.23486328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1661
+ llama4_rope,liger,full,memory,MB,H,hidden size,2048,52.89111328125,52.89111328125,52.89111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1662
+ llama4_rope,liger,full,memory,MB,H,hidden size,8192,115.51611328125,115.51611328125,115.51611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1663
+ llama4_rope,huggingface,full,memory,MB,H,hidden size,512,49.64111328125,49.64111328125,49.64111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1664
+ llama4_rope,huggingface,full,memory,MB,H,hidden size,2048,102.51611328125,102.51611328125,102.51611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1665
+ llama4_rope,huggingface,full,memory,MB,H,hidden size,8192,314.01611328125,314.01611328125,314.01611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1666
+ llama4_rope,liger,forward,speed,ms,T,sequence length,1024,0.07417599856853485,0.07248000055551529,0.07596799731254578,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1667
+ llama4_rope,liger,forward,speed,ms,T,sequence length,2048,0.08182399719953537,0.08006399869918823,0.08380799740552902,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1668
+ llama4_rope,liger,forward,speed,ms,T,sequence length,4096,0.11708799749612808,0.1167680025100708,0.11744000017642975,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1669
+ llama4_rope,liger,forward,speed,ms,T,sequence length,8192,0.2165440022945404,0.21596799790859222,0.21715199947357178,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1670
+ llama4_rope,liger,forward,speed,ms,T,sequence length,16384,0.41756799817085266,0.41705599427223206,0.41811200976371765,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1671
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,1024,0.11644800007343292,0.11590400338172913,0.11708799749612808,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1672
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,2048,0.20659199357032776,0.20608000457286835,0.2072640061378479,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1673
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,4096,0.38553598523139954,0.3846847891807556,0.38624000549316406,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1674
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,8192,0.7411519885063171,0.7403839826583862,0.7420480251312256,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1675
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,16384,1.4553920030593872,1.4543871641159059,1.4562879800796509,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1676
+ llama4_rope,liger,backward,speed,ms,T,sequence length,1024,0.11840000003576279,0.11711999773979187,0.12031999975442886,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1677
+ llama4_rope,liger,backward,speed,ms,T,sequence length,2048,0.12336000055074692,0.12198399752378464,0.12489599734544754,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1678
+ llama4_rope,liger,backward,speed,ms,T,sequence length,4096,0.12380799651145935,0.12240000069141388,0.12559999525547028,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1679
+ llama4_rope,liger,backward,speed,ms,T,sequence length,8192,0.2170879989862442,0.2165759950876236,0.21753600239753723,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1680
+ llama4_rope,liger,backward,speed,ms,T,sequence length,16384,0.4175359904766083,0.41705599427223206,0.4181375920772552,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1681
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,1024,0.1189119964838028,0.11769600212574005,0.12003199756145477,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1682
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,2048,0.21011200547218323,0.20927999913692474,0.21119999885559082,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1683
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,4096,0.39740800857543945,0.3963199853897095,0.39824000000953674,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1684
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,8192,0.7540159821510315,0.7528960108757019,0.7550719976425171,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1685
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,16384,1.4822720289230347,1.4810559749603271,1.4833600521087646,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1686
+ llama4_rope,liger,full,speed,ms,T,sequence length,1024,0.2874400019645691,0.2853440046310425,0.29052799940109253,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1687
+ llama4_rope,liger,full,speed,ms,T,sequence length,2048,0.28646400570869446,0.2845759987831116,0.28963199257850647,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1688
+ llama4_rope,liger,full,speed,ms,T,sequence length,4096,0.29897600412368774,0.29660800099372864,0.302131199836731,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1689
+ llama4_rope,liger,full,speed,ms,T,sequence length,8192,0.4315840005874634,0.4304639995098114,0.43270400166511536,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1690
+ llama4_rope,liger,full,speed,ms,T,sequence length,16384,0.833184003829956,0.8322240114212036,0.8345024228096007,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1691
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,1024,0.24592000246047974,0.24396799504756927,0.24876800179481506,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1692
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,2048,0.4138239920139313,0.41308799386024475,0.4145599901676178,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1693
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,4096,0.7800959944725037,0.7790719866752625,0.7810239791870117,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1694
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,8192,1.4911680221557617,1.4902976036071778,1.4922879934310913,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1695
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,16384,2.9344160556793213,2.9333438873291016,2.9353599548339844,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1696
+ llama4_rope,liger,full,memory,MB,T,sequence length,1024,73.75830078125,73.75830078125,73.75830078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1697
+ llama4_rope,liger,full,memory,MB,T,sequence length,2048,115.51611328125,115.51611328125,115.51611328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1698
+ llama4_rope,liger,full,memory,MB,T,sequence length,4096,199.03173828125,199.03173828125,199.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1699
+ llama4_rope,liger,full,memory,MB,T,sequence length,8192,366.06298828125,366.06298828125,366.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1700
+ llama4_rope,liger,full,memory,MB,T,sequence length,16384,700.12548828125,700.12548828125,700.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1701
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,1024,173.00830078125,173.00830078125,173.00830078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1702
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,2048,314.01611328125,314.01611328125,314.01611328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1703
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,4096,596.03173828125,596.03173828125,596.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1704
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,8192,1160.06298828125,1160.06298828125,1160.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1705
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,16384,2288.12548828125,2288.12548828125,2288.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
@@ -70,6 +70,9 @@ def bench_speed_cross_entropy(
70
70
 
71
71
  if mode == "forward":
72
72
  ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES)
73
+ elif mode == "no-grad-forward":
74
+ with torch.no_grad():
75
+ ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES)
73
76
  elif mode == "backward":
74
77
  y = fwd()
75
78
 
@@ -109,7 +112,7 @@ if __name__ == "__main__":
109
112
 
110
113
  run_benchmarks(
111
114
  bench_test_fn=bench_speed_cross_entropy,
112
- kernel_operation_modes=["forward", "backward", "full"],
115
+ kernel_operation_modes=["forward", "backward", "full", "no-grad-forward"],
113
116
  metric_name="speed",
114
117
  metric_unit="ms",
115
118
  **common_configs,