liger-kernel 0.6.0__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (279)
  1. liger_kernel-0.6.1/.github/workflows/benchmark.yml +167 -0
  2. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/PKG-INFO +11 -13
  3. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/README.md +10 -12
  4. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/data/all_benchmark_data.csv +112 -30
  5. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_cpo_loss.py +14 -8
  6. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_dpo_loss.py +14 -16
  7. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_embedding.py +8 -0
  8. liger_kernel-0.6.1/benchmark/scripts/benchmark_fused_add_rms_norm.py +201 -0
  9. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_orpo_loss.py +14 -8
  10. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_simpo_loss.py +14 -8
  11. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/dev/modal/benchmarks.py +1 -1
  12. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/index.md +8 -10
  13. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/pyproject.toml +1 -1
  14. liger_kernel-0.6.1/src/liger_kernel/ops/fused_add_rms_norm.py +412 -0
  15. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/layer_norm.py +126 -89
  16. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/rms_norm.py +2 -2
  17. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/rope.py +1 -1
  18. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/__init__.py +5 -0
  19. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/functional.py +5 -0
  20. liger_kernel-0.6.1/src/liger_kernel/transformers/fused_add_rms_norm.py +39 -0
  21. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/gemma3.py +1 -1
  22. liger_kernel-0.6.1/src/liger_kernel/transformers/model/smollm3.py +189 -0
  23. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/monkey_patch.py +85 -12
  24. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/PKG-INFO +11 -13
  25. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/SOURCES.txt +5 -0
  26. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/bf16/test_mini_models.py +64 -0
  27. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/bf16/test_mini_models_with_logits.py +63 -0
  28. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/fp32/test_mini_models.py +61 -0
  29. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/fp32/test_mini_models_with_logits.py +61 -0
  30. liger_kernel-0.6.1/test/transformers/test_fused_add_rms_norm.py +219 -0
  31. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_layer_norm.py +3 -0
  32. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_monkey_patch.py +52 -0
  33. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/utils.py +12 -0
  34. liger_kernel-0.6.0/.github/workflows/benchmark.yml +0 -93
  35. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  36. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  37. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/pull_request_template.md +0 -0
  38. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/amd-ci.yml +0 -0
  39. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/docs.yml +0 -0
  40. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/intel-ci.yml +0 -0
  41. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/nvi-ci.yml +0 -0
  42. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/publish-nightly.yml +0 -0
  43. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/publish-release.yml +0 -0
  44. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.gitignore +0 -0
  45. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/LICENSE +0 -0
  46. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/Makefile +0 -0
  47. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/NOTICE +0 -0
  48. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/README.md +0 -0
  49. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/__init__.py +0 -0
  50. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/benchmarks_visualizer.py +0 -0
  51. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/__init__.py +0 -0
  52. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  53. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  54. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  55. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_dyt.py +0 -0
  56. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  57. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  58. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  59. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_geglu.py +0 -0
  60. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_group_norm.py +0 -0
  61. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_jsd.py +0 -0
  62. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_kl_div.py +0 -0
  63. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  64. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  65. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  66. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  67. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  68. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_rope.py +0 -0
  69. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_softmax.py +0 -0
  70. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  71. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  72. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_swiglu.py +0 -0
  73. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_tvd.py +0 -0
  74. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/utils.py +0 -0
  75. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/dev/fmt-requirements.txt +0 -0
  76. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/dev/modal/tests.py +0 -0
  77. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/dev/modal/tests_bwd.py +0 -0
  78. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/Examples.md +0 -0
  79. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/Getting-Started.md +0 -0
  80. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/High-Level-APIs.md +0 -0
  81. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/Low-Level-APIs.md +0 -0
  82. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/acknowledgement.md +0 -0
  83. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/contributing.md +0 -0
  84. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/banner.GIF +0 -0
  85. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/compose.gif +0 -0
  86. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/e2e-memory.png +0 -0
  87. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/e2e-tps.png +0 -0
  88. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/logo-banner.png +0 -0
  89. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/patch.gif +0 -0
  90. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/post-training.png +0 -0
  91. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/license.md +0 -0
  92. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/alignment/accelerate_config.yaml +0 -0
  93. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/alignment/run_orpo.py +0 -0
  94. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/README.md +0 -0
  95. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/callback.py +0 -0
  96. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/config/fsdp_config.json +0 -0
  97. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  98. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  99. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  100. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/llama_tps.png +0 -0
  101. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  102. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/qwen_tps.png +0 -0
  103. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/launch_on_modal.py +0 -0
  104. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/requirements.txt +0 -0
  105. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_benchmarks.sh +0 -0
  106. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_gemma.sh +0 -0
  107. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_llama.sh +0 -0
  108. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_qwen.sh +0 -0
  109. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_qwen2_vl.sh +0 -0
  110. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/training.py +0 -0
  111. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/training_multimodal.py +0 -0
  112. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/lightning/README.md +0 -0
  113. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/lightning/requirements.txt +0 -0
  114. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/lightning/training.py +0 -0
  115. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/README.md +0 -0
  116. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/callback.py +0 -0
  117. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  118. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  119. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  120. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  121. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  122. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  123. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  124. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  125. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  126. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/medusa_util.py +0 -0
  127. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/requirements.txt +0 -0
  128. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  129. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/train.py +0 -0
  130. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-Apache-2.0 +0 -0
  131. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  132. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  133. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-MIT-llmc +0 -0
  134. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-MIT-triton +0 -0
  135. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/mkdocs.yml +0 -0
  136. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/setup.cfg +0 -0
  137. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/setup.py +0 -0
  138. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/__init__.py +0 -0
  139. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/README.md +0 -0
  140. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  141. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
  142. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  143. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  144. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/functional.py +0 -0
  145. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  146. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  147. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  148. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  149. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  150. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  151. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  152. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  153. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  154. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/env_report.py +0 -0
  155. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/__init__.py +0 -0
  156. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/cross_entropy.py +0 -0
  157. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/dyt.py +0 -0
  158. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  159. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  160. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  161. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  162. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  163. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/geglu.py +0 -0
  164. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/group_norm.py +0 -0
  165. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/grpo_loss.py +0 -0
  166. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/jsd.py +0 -0
  167. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/kl_div.py +0 -0
  168. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  169. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  170. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/softmax.py +0 -0
  171. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/sparsemax.py +0 -0
  172. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/swiglu.py +0 -0
  173. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/tvd.py +0 -0
  174. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/utils.py +0 -0
  175. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/auto_model.py +0 -0
  176. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  177. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/dyt.py +0 -0
  178. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  179. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/fsdp.py +0 -0
  180. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  181. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  182. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  183. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/geglu.py +0 -0
  184. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/group_norm.py +0 -0
  185. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  186. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/jsd.py +0 -0
  187. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/kl_div.py +0 -0
  188. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/layer_norm.py +0 -0
  189. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/__init__.py +0 -0
  190. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/gemma.py +0 -0
  191. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  192. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/glm4.py +0 -0
  193. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/llama.py +0 -0
  194. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/llama4.py +0 -0
  195. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/llava.py +0 -0
  196. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  197. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/mistral.py +0 -0
  198. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  199. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/mllama.py +0 -0
  200. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  201. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/paligemma.py +0 -0
  202. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/phi3.py +0 -0
  203. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  204. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  205. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  206. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  207. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  208. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  209. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  210. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/rms_norm.py +0 -0
  211. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/rope.py +0 -0
  212. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/softmax.py +0 -0
  213. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/sparsemax.py +0 -0
  214. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/swiglu.py +0 -0
  215. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  216. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  217. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  218. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/tvd.py +0 -0
  219. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/triton/__init__.py +0 -0
  220. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/triton/monkey_patch.py +0 -0
  221. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/utils.py +0 -0
  222. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/dependency_links.txt +0 -0
  223. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/requires.txt +0 -0
  224. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/top_level.txt +0 -0
  225. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/__init__.py +0 -0
  226. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/__init__.py +0 -0
  227. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_cosine_loss.py +0 -0
  228. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_cpo_loss.py +0 -0
  229. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_dpo_loss.py +0 -0
  230. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_grpo_loss.py +0 -0
  231. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_jsd_loss.py +0 -0
  232. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_kto_loss.py +0 -0
  233. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_orpo_loss.py +0 -0
  234. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_simpo_loss.py +0 -0
  235. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/conftest.py +0 -0
  236. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/__init__.py +0 -0
  237. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/bf16/__init__.py +0 -0
  238. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
  239. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/fp32/__init__.py +0 -0
  240. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
  241. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  242. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  243. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  244. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  245. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  246. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  247. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  248. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  249. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  250. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  251. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/tiny_shakespeare.txt +0 -0
  252. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  253. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  254. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  255. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_auto_model.py +0 -0
  256. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_cross_entropy.py +0 -0
  257. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_dyt.py +0 -0
  258. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_embedding.py +0 -0
  259. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_flex_attention.py +0 -0
  260. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  261. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_fused_linear_jsd.py +0 -0
  262. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  263. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_geglu.py +0 -0
  264. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_group_norm.py +0 -0
  265. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_grpo_loss.py +0 -0
  266. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_jsd.py +0 -0
  267. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_kl_div.py +0 -0
  268. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_mm_int8int2.py +0 -0
  269. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_multi_token_attention.py +0 -0
  270. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_qwen2vl_mrope.py +0 -0
  271. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_rms_norm.py +0 -0
  272. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_rope.py +0 -0
  273. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_softmax.py +0 -0
  274. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_sparsemax.py +0 -0
  275. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_swiglu.py +0 -0
  276. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_trainer_integration.py +0 -0
  277. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_transformers.py +0 -0
  278. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_tvd.py +0 -0
  279. {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -0,0 +1,167 @@
1
+ name: Benchmarks
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ schedule:
7
+ # Runs at 00:00 UTC every Friday
8
+ - cron: '0 0 * * 5'
9
+ workflow_dispatch: # Enables manual trigger
10
+ inputs:
11
+ commit_hash:
12
+ description: 'Commit hash to benchmark'
13
+ default: 'main'
14
+ overwrite:
15
+ description: 'Overwrite existing benchmark data if it exists'
16
+ type: boolean
17
+ default: false
18
+
19
+ permissions:
20
+ contents: write
21
+
22
+ concurrency:
23
+ # This causes it to cancel previous in-progress actions on the same PR / branch.
24
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
25
+ cancel-in-progress: true
26
+
27
+ jobs:
28
+ benchmarks:
29
+ runs-on: ubuntu-latest
30
+ env:
31
+ MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
32
+ MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
33
+ GITHUB_USERNAME: linkedin
34
+ REPO_NAME: Liger-Kernel
35
+ OUTPUT_DIR: benchmarks
36
+ OUTPUT_FILENAME: benchmark.csv
37
+ GENERATED_CSV: benchmark/data/all_benchmark_data.csv
38
+
39
+
40
+ steps:
41
+ # Step: Decide the commit hash to use
42
+ - name: Determine commit hash to checkout
43
+ id: choose_commit
44
+ run: |
45
+ if [ "${{ github.event.inputs.commit_hash }}" != "" ]; then
46
+ echo "Using manual input commit: ${{ github.event.inputs.commit_hash }}"
47
+ echo "hash=${{ github.event.inputs.commit_hash }}" >> $GITHUB_OUTPUT
48
+ else
49
+ echo "Using latest commit from main"
50
+ git fetch origin main
51
+ echo "hash=$(git rev-parse origin/main)" >> $GITHUB_OUTPUT
52
+ fi
53
+
54
+ # Step: Checkout full history so we can check out any commit
55
+ - name: Checkout full repo history
56
+ uses: actions/checkout@v3
57
+ with:
58
+ fetch-depth: 0 # Important: so we can checkout arbitrary commit
59
+ # Step: Conditionally replace benchmark folder from main
60
+ - name: Replace benchmark folder from main (manual only, commit ≠ main)
61
+ if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.commit_hash != 'main' }}
62
+ run: |
63
+ echo "Detected manual trigger with commit_hash = ${{ github.event.inputs.commit_hash }}"
64
+
65
+ # Save current branch (detached HEAD at old commit)
66
+ ORIG_COMMIT=${{ github.event.inputs.commit_hash }}
67
+
68
+ # Fetch and checkout main
69
+ git fetch origin main
70
+ git checkout origin/main -- benchmark/
71
+
72
+ # Save benchmark folder from main
73
+ cp -r benchmark /tmp/benchmark_main
74
+ # Checkout back to target commit
75
+ git checkout $ORIG_COMMIT
76
+ # Replace old benchmark with one from main
77
+ rm -rf benchmark
78
+ cp -r /tmp/benchmark_main benchmark
79
+
80
+ # Step: Check if benchmark exists and exit if overwrite is false
81
+ - name: Check existing benchmark
82
+ run: |
83
+ COMMIT_HASH="${{ steps.choose_commit.outputs.hash }}"
84
+ BENCHMARK_URL="https://raw.githubusercontent.com/linkedin/Liger-Kernel/refs/heads/gh-pages/benchmarks/${COMMIT_HASH}/benchmark.csv"
85
+
86
+ if curl --output /dev/null --silent --head --fail "$BENCHMARK_URL"; then
87
+ echo "Benchmark already exists for commit $COMMIT_HASH"
88
+ if [ "${{ github.event.inputs.overwrite }}" != "true" ]; then
89
+ echo "Overwrite is false - exiting"
90
+ exit 1
91
+ else
92
+ echo "Overwrite is true - proceeding"
93
+ fi
94
+ else
95
+ echo "No existing benchmark found - proceeding"
96
+ fi
97
+
98
+ - name: Set up Python
99
+ uses: actions/setup-python@v3
100
+ with:
101
+ python-version: '3.10'
102
+
103
+ # Install dependencies
104
+ - name: Install dependencies
105
+ run: |
106
+ python -m pip install --upgrade pip
107
+ pip install modal
108
+
109
+ # Delete previous benchmark results.
110
+ - name: Remove previous benchmark data
111
+ run: |
112
+ rm -f benchmark/data/all_benchmark_data.csv
113
+
114
+ - name: Run benchmarks on GPU
115
+ run: |
116
+ modal run dev.modal.benchmarks
117
+
118
+ # Step 5: Checkout gh-pages branch in a subfolder
119
+ - name: Checkout gh-pages
120
+ uses: actions/checkout@v3
121
+ with:
122
+ ref: gh-pages
123
+ path: gh-pages
124
+
125
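+ # Step 6: Copy benchmark CSV to gh-pages directory (NOTE(review): this step never writes `path` to $GITHUB_OUTPUT, so steps.copy_benchmark.outputs.path is empty in the later steps that read it)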
126
+ - name: Copy generated benchmark to gh-pages
127
+ id: copy_benchmark
128
+ run: |
129
+ if [[ "${{ github.event_name }}" == "release" ]]; then
130
+ echo "Release event detected"
131
+ path=${{steps.choose_commit.outputs.hash}}-${{ github.event.release.tag_name }}
132
+ else
133
+ echo "Not a release event"
134
+ path=${{steps.choose_commit.outputs.hash}}
135
+ fi
136
+ COMMIT_DIR="gh-pages/${OUTPUT_DIR}/${path}"
137
+
138
+ mkdir -p "$COMMIT_DIR"
139
+
140
+ if [ -f "$COMMIT_DIR/${OUTPUT_FILENAME}" ]; then
141
+ echo "Removing existing benchmark.csv for this commit"
142
+ rm "$COMMIT_DIR/${OUTPUT_FILENAME}"
143
+ fi
144
+
145
+ cp "${GENERATED_CSV}" "$COMMIT_DIR/${OUTPUT_FILENAME}"
146
+
147
+ # Step 7: Append the benchmark path to commits.txt (appended unconditionally; existing entries are not de-duplicated)
148
+ - name: Update commits.txt
149
+ run: |
150
+ cd gh-pages
151
+ echo "commits.txt file path: ${OUTPUT_DIR}/commits.txt"
152
+ # Create file if it doesn't exist
153
+ mkdir -p ${OUTPUT_DIR}
154
+ touch ${OUTPUT_DIR}/commits.txt
155
+
156
+ echo "${{ steps.copy_benchmark.outputs.path }}" >> ${OUTPUT_DIR}/commits.txt
157
+
158
+ echo "Added commit hash to commits.txt"
159
+ # Step 8: Commit and push
160
+ - name: Commit and push to gh-pages
161
+ run: |
162
+ cd gh-pages
163
+ git config user.name github-actions[bot]
164
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
165
+ git add .
166
+ git commit -m "Add benchmark for commit ${{ steps.copy_benchmark.outputs.path }}" || echo "No changes to commit"
167
+ git push origin gh-pages
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: liger_kernel
3
- Version: 0.6.0
3
+ Version: 0.6.1
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -84,7 +84,7 @@ Dynamic: requires-dist
84
84
  </td>
85
85
  <td style="padding: 10px;">
86
86
  <a href="https://discord.gg/gpumode">
87
- <img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
87
+ <img src="https://dcbadge.limes.pink/api/server/gpumode?style=flat" alt="Join Our Discord">
88
88
  </a>
89
89
  </td>
90
90
  </tr>
@@ -307,7 +307,7 @@ loss.backward()
307
307
  | Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
308
308
  | Qwen2.5-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_5_vl` | RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
309
309
  | Qwen3 | `liger_kernel.transformers.apply_liger_kernel_to_qwen3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
310
- | Qwen3 MoE | `liger_kernel_transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
310
+ | Qwen3 MoE | `liger_kernel.transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
311
311
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
312
312
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
313
313
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -414,21 +414,19 @@ loss.backward()
414
414
 
415
415
  - For issues, create a GitHub ticket in this repository
416
416
  - For open discussion, join [our discord channel on GPUMode](https://discord.com/channels/1189498204333543425/1275130785933951039)
417
- - For formal collaboration, send an email to yannchen@linkedin.com and hning@linkedin.com
417
+ - For formal collaboration, send an email to Yanning Chen (yannchen@linkedin.com) and Zhipeng Wang (zhipwang@linkedin.com)
418
418
 
419
419
  ## Cite this work
420
420
 
421
421
  Biblatex entry:
422
422
  ```bib
423
- @article{hsu2024ligerkernelefficienttriton,
424
- title={Liger Kernel: Efficient Triton Kernels for LLM Training},
425
- author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
426
- year={2024},
427
- eprint={2410.10989},
428
- archivePrefix={arXiv},
429
- primaryClass={cs.LG},
430
- url={https://arxiv.org/abs/2410.10989},
431
- journal={arXiv preprint arXiv:2410.10989},
423
+ @inproceedings{
424
+ hsu2025ligerkernel,
425
+ title={Liger-Kernel: Efficient Triton Kernels for {LLM} Training},
426
+ author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen and Zhipeng Wang},
427
+ booktitle={Championing Open-source DEvelopment in ML Workshop @ ICML25},
428
+ year={2025},
429
+ url={https://openreview.net/forum?id=36SjAIT42G}
432
430
  }
433
431
  ```
434
432
 
@@ -32,7 +32,7 @@
32
32
  </td>
33
33
  <td style="padding: 10px;">
34
34
  <a href="https://discord.gg/gpumode">
35
- <img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
35
+ <img src="https://dcbadge.limes.pink/api/server/gpumode?style=flat" alt="Join Our Discord">
36
36
  </a>
37
37
  </td>
38
38
  </tr>
@@ -255,7 +255,7 @@ loss.backward()
255
255
  | Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
256
256
  | Qwen2.5-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_5_vl` | RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
257
257
  | Qwen3 | `liger_kernel.transformers.apply_liger_kernel_to_qwen3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
258
- | Qwen3 MoE | `liger_kernel_transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
258
+ | Qwen3 MoE | `liger_kernel.transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
259
259
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
260
260
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
261
261
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -362,21 +362,19 @@ loss.backward()
362
362
 
363
363
  - For issues, create a GitHub ticket in this repository
364
364
  - For open discussion, join [our discord channel on GPUMode](https://discord.com/channels/1189498204333543425/1275130785933951039)
365
- - For formal collaboration, send an email to yannchen@linkedin.com and hning@linkedin.com
365
+ - For formal collaboration, send an email to Yanning Chen (yannchen@linkedin.com) and Zhipeng Wang (zhipwang@linkedin.com)
366
366
 
367
367
  ## Cite this work
368
368
 
369
369
  Biblatex entry:
370
370
  ```bib
371
- @article{hsu2024ligerkernelefficienttriton,
372
- title={Liger Kernel: Efficient Triton Kernels for LLM Training},
373
- author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
374
- year={2024},
375
- eprint={2410.10989},
376
- archivePrefix={arXiv},
377
- primaryClass={cs.LG},
378
- url={https://arxiv.org/abs/2410.10989},
379
- journal={arXiv preprint arXiv:2410.10989},
371
+ @inproceedings{
372
+ hsu2025ligerkernel,
373
+ title={Liger-Kernel: Efficient Triton Kernels for {LLM} Training},
374
+ author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen and Zhipeng Wang},
375
+ booktitle={Championing Open-source DEvelopment in ML Workshop @ ICML25},
376
+ year={2025},
377
+ url={https://openreview.net/forum?id=36SjAIT42G}
380
378
  }
381
379
  ```
382
380
 
@@ -625,36 +625,6 @@ group_norm,huggingface,backward,memory,MB,C,num_channels,256,320.5078125,320.507
625
625
  group_norm,huggingface,backward,memory,MB,C,num_channels,512,641.015625,641.015625,641.015625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
626
626
  group_norm,huggingface,backward,memory,MB,C,num_channels,1024,1282.03125,1282.03125,1282.03125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
627
627
  group_norm,huggingface,backward,memory,MB,C,num_channels,2048,2564.0625,2564.0625,2564.0625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
628
- layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.035840000957250595,0.03481600061058998,0.035840000957250595,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
629
- layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.05939200147986412,0.058368001133203506,0.060416001826524734,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
630
- layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.10751999914646149,0.10751999914646149,0.1085439994931221,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
631
- layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.20582400262355804,0.20479999482631683,0.20684799551963806,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
632
- layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.3993600010871887,0.3983359932899475,0.40140798687934875,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
633
- layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.03788800165057182,0.03788800165057182,0.03891199827194214,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
634
- layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.0655359998345375,0.0655359998345375,0.06656000018119812,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
635
- layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.14745600521564484,0.14643199741840363,0.14847999811172485,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
636
- layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.31334400177001953,0.3123199939727783,0.31436800956726074,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
637
- layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.6133760213851929,0.6123520135879517,0.6154239773750305,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
638
- layer_norm,liger,full,speed,ms,N,hidden size,1024,0.6860799789428711,0.6146048903465271,0.7049216032028198,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
639
- layer_norm,liger,full,speed,ms,N,hidden size,2048,0.6789119839668274,0.6737920045852661,0.6912000179290771,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
640
- layer_norm,liger,full,speed,ms,N,hidden size,4096,0.6686720252037048,0.6635519862174988,0.681984007358551,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
641
- layer_norm,liger,full,speed,ms,N,hidden size,8192,0.6789119839668274,0.5908480286598206,0.6932479739189148,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
642
- layer_norm,liger,full,speed,ms,N,hidden size,16384,6.071296215057373,5.331148624420166,6.08235502243042,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
643
- layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.13312000036239624,0.13209599256515503,0.13312000036239624,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
644
- layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.23244799673557281,0.2303999960422516,0.23347200453281403,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
645
- layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.5242879986763,0.5232639908790588,0.5263360142707825,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
646
- layer_norm,huggingface,full,speed,ms,N,hidden size,8192,1.0168319940567017,1.0147839784622192,1.018880009651184,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
647
- layer_norm,huggingface,full,speed,ms,N,hidden size,16384,1.994752049446106,1.9916800260543823,1.9967999458312988,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
648
- layer_norm,liger,full,memory,MB,N,hidden size,1024,80.90625,80.90625,80.90625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
649
- layer_norm,liger,full,memory,MB,N,hidden size,2048,161.78125,161.78125,161.78125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
650
- layer_norm,liger,full,memory,MB,N,hidden size,4096,323.53125,323.53125,323.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
651
- layer_norm,liger,full,memory,MB,N,hidden size,8192,647.03125,647.03125,647.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
652
- layer_norm,liger,full,memory,MB,N,hidden size,16384,1294.03125,1294.03125,1294.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
653
- layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
654
- layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
655
- layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
656
- layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
657
- layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
658
628
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,2,116.00621032714844,116.00621032714844,116.00621032714844,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
659
629
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,4,230.83609008789062,230.83609008789062,230.83609008789062,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
660
630
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,8,461.9543151855469,461.9543151855469,461.9543151855469,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
@@ -1493,3 +1463,115 @@ distill_cosine_loss,torch,full,memory,MB,BT,B x T,1024,7566.2822265625,7566.2822
1493
1463
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,2048,11590.3134765625,11590.3134765625,11590.3134765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1494
1464
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,4096,19654.375,19654.375,19654.375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1495
1465
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,8192,35782.5,35782.5,35782.5,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1466
+ layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.018848000094294548,0.018400000408291817,0.020102400332689285,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1467
+ layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.029152000322937965,0.02876799926161766,0.029823999851942062,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1468
+ layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.05104000121355057,0.05036799982190132,0.05177599936723709,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1469
+ layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.0947519987821579,0.09436800330877304,0.09507200121879578,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1470
+ layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.18476800620555878,0.18396799266338348,0.1852159947156906,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1471
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.023584000766277313,0.023423999547958374,0.023840000852942467,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1472
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.03734400123357773,0.03702399879693985,0.037811201065778746,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1473
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.06617599725723267,0.06560000032186508,0.06678400188684464,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1474
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.15267199277877808,0.15190400183200836,0.15347200632095337,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1475
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.3067840039730072,0.3046143889427185,0.3081152021884918,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1476
+ layer_norm,liger,backward,speed,ms,N,hidden size,1024,0.12006399780511856,0.11653760075569153,0.12467200309038162,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1477
+ layer_norm,liger,backward,speed,ms,N,hidden size,2048,0.1207360029220581,0.1176128014922142,0.1256511986255646,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1478
+ layer_norm,liger,backward,speed,ms,N,hidden size,4096,0.16630400717258453,0.16412800550460815,0.16838400065898895,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1479
+ layer_norm,liger,backward,speed,ms,N,hidden size,8192,0.31279999017715454,0.31116798520088196,0.3145279884338379,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1480
+ layer_norm,liger,backward,speed,ms,N,hidden size,16384,0.5776320099830627,0.5753471970558167,0.5798912048339844,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1481
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,1024,0.0605119988322258,0.059647999703884125,0.061344001442193985,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1482
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,2048,0.09967999905347824,0.09849599748849869,0.10099200159311295,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1483
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,4096,0.17881600558757782,0.17795200645923615,0.17971199750900269,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1484
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,8192,0.33369600772857666,0.3328000009059906,0.33478400111198425,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1485
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,16384,0.6424000263214111,0.6412223815917969,0.643455982208252,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1486
+ layer_norm,liger,full,speed,ms,N,hidden size,1024,0.26576000452041626,0.2629248082637787,0.2701759934425354,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1487
+ layer_norm,liger,full,speed,ms,N,hidden size,2048,0.27427199482917786,0.26999040842056277,0.28091518878936766,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1488
+ layer_norm,liger,full,speed,ms,N,hidden size,4096,0.27454400062561035,0.27004799246788025,0.2807359993457794,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1489
+ layer_norm,liger,full,speed,ms,N,hidden size,8192,0.40556800365448,0.40403199195861816,0.40723198652267456,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1490
+ layer_norm,liger,full,speed,ms,N,hidden size,16384,0.7608960270881653,0.7589311957359314,0.7631679773330688,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1491
+ layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.08025600016117096,0.07942400127649307,0.08111999928951263,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1492
+ layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.13315199315547943,0.13180799782276154,0.13468800485134125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1493
+ layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.2417600005865097,0.24089600145816803,0.24262399971485138,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1494
+ layer_norm,huggingface,full,speed,ms,N,hidden size,8192,0.4832639992237091,0.48214399814605713,0.4843647956848145,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1495
+ layer_norm,huggingface,full,speed,ms,N,hidden size,16384,0.950575977563858,0.9484800100326538,0.9528064012527466,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1496
+ layer_norm,liger,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1497
+ layer_norm,liger,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1498
+ layer_norm,liger,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1499
+ layer_norm,liger,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1500
+ layer_norm,liger,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1501
+ layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1502
+ layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1503
+ layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1504
+ layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1505
+ layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1506
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,1024,0.01759999990463257,0.017311999574303627,0.017920000478625298,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1507
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,2048,0.02924799919128418,0.028863999992609024,0.029983999207615852,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1508
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,4096,0.05129599943757057,0.050624001771211624,0.05209600180387497,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1509
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,8192,0.09344000369310379,0.09296000003814697,0.09382399916648865,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1510
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,16384,0.1791680008172989,0.17814399302005768,0.1796800047159195,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1511
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,32768,0.43830400705337524,0.43744000792503357,0.43929600715637207,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1512
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,1024,0.060095999389886856,0.059808000922203064,0.06054399907588959,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1513
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,2048,0.09084799885749817,0.09027200192213058,0.09161599725484848,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1514
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,4096,0.17820799350738525,0.17744000256061554,0.17897599935531616,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1515
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,8192,0.312608003616333,0.3118720054626465,0.31324800848960876,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1516
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,16384,0.574944019317627,0.5740479826927185,0.5756288051605225,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1517
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,32768,1.0943039655685425,1.0934272289276123,1.0951999425888062,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1518
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,1024,0.0352960005402565,0.03481600061058998,0.03811199963092804,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1519
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,2048,0.05430399999022484,0.05392000079154968,0.05503999814391136,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1520
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,4096,0.10592000186443329,0.1054655984044075,0.10630399733781815,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1521
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,8192,0.19679999351501465,0.19631999731063843,0.19724799692630768,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1522
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,16384,0.37436801195144653,0.3733760118484497,0.3752320110797882,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1523
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,32768,0.7376000285148621,0.7361343741416931,0.7391359806060791,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1524
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,1024,0.3147200047969818,0.30796160697937014,0.32764801383018494,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1525
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,2048,0.3089919984340668,0.30374398827552795,0.3226880133152008,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1526
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,4096,0.30691200494766235,0.3023296058177948,0.3205504059791565,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1527
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,8192,0.3246079981327057,0.3185984075069428,0.33656961321830753,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1528
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,16384,0.6010559797286987,0.5996800065040588,0.6026239991188049,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1529
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,32768,1.8402559757232666,1.8322880268096924,1.8461120128631592,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1530
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,1024,0.23878400027751923,0.23545600473880768,0.2507520020008087,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1531
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,2048,0.34513600170612335,0.34377598762512207,0.34678399562835693,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1532
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,4096,0.6330879926681519,0.631712019443512,0.6345599889755249,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1533
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,8192,1.1185599565505981,1.1172800064086914,1.1196800470352173,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1534
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,16384,2.0697600841522217,2.0678528785705566,2.0713536739349365,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1535
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,32768,3.9561920166015625,3.953824043273926,3.9581120014190674,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1536
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,1024,0.38916800916194916,0.3824320137500763,0.4037184059619903,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1537
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,2048,0.3890720009803772,0.38193280100822447,0.4032831907272339,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1538
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,4096,0.39715200662612915,0.3928639888763428,0.41097599267959595,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1539
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,8192,0.6275200247764587,0.6259520053863525,0.6287999749183655,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1540
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,16384,1.202239990234375,1.199679970741272,1.2048959732055664,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1541
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,32768,2.7738559246063232,2.7705343723297116,2.777868890762329,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1542
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,1024,0.15619200468063354,0.15376000106334686,0.1661248028278351,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1543
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,2048,0.15825600177049637,0.15600000321865082,0.16911999881267548,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1544
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,4096,0.16700799763202667,0.16502399742603302,0.1709440052509308,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1545
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,8192,0.1712000072002411,0.1700800061225891,0.17215999960899353,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1546
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,16384,0.42505601048469543,0.4233280122280121,0.42691200971603394,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1547
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,32768,1.4057759642601013,1.3944000005722046,1.4099839925765991,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1548
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,1024,0.1520960032939911,0.15136000514030457,0.1528960019350052,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1549
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,2048,0.2533760070800781,0.2524160146713257,0.25436800718307495,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1550
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,4096,0.4551039934158325,0.4540799856185913,0.45612800121307373,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1551
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,8192,0.8053439855575562,0.8038079738616943,0.806656002998352,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1552
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,16384,1.4933120012283325,1.492095947265625,1.49452805519104,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1553
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,32768,2.8600640296936035,2.8583295822143557,2.8612607955932616,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1554
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,1024,0.20175999402999878,0.199072003364563,0.2154303938150406,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1555
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,2048,0.20263999700546265,0.20000000298023224,0.21675519943237304,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1556
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,4096,0.25276800990104675,0.2515519857406616,0.2539199888706207,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1557
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,8192,0.4322720021009445,0.43088001012802124,0.4336000084877014,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1558
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,16384,0.8288000226020813,0.8266303777694701,0.8311295866966247,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1559
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,32768,2.03987193107605,2.0360767364501955,2.0436416149139403,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1560
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,1024,72.546875,72.546875,72.546875,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1561
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,2048,145.0859375,145.0859375,145.0859375,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1562
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,4096,290.1640625,290.1640625,290.1640625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1563
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,8192,580.3203125,580.3203125,580.3203125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1564
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,16384,1160.6328125,1160.6328125,1160.6328125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1565
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,32768,2321.2578125,2321.2578125,2321.2578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1566
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,1024,104.03173828125,104.03173828125,104.03173828125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1567
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,2048,208.05517578125,208.05517578125,208.05517578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1568
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,4096,416.10205078125,416.10205078125,416.10205078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1569
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,8192,832.19580078125,832.19580078125,832.19580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1570
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,16384,1664.3125,1664.3125,1664.3125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1571
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,32768,3328.625,3328.625,3328.625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1572
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,1024,104.03564453125,104.03564453125,104.03564453125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1573
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,2048,208.06298828125,208.06298828125,208.06298828125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1574
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578125,416.11767578125,416.11767578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1575
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1576
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1577
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
@@ -36,17 +36,20 @@ def bench_memory_fused_linear_cpo_loss(
36
36
  dtype = input.extra_benchmark_config["dtype"]
37
37
  provider = input.kernel_provider
38
38
 
39
- torch_lm_head_cpo = lambda x, target: TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
40
- liger_lm_head_cpo = lambda x, target: LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
39
+ # Instantiate once and retrieve the first output only
40
+ torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
41
+ liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
42
+ torch_fwd = lambda x, target: torch_lm_head_cpo(x, target)[0]
43
+ liger_fwd = lambda x, target: liger_lm_head_cpo(x, target)[0]
41
44
 
42
45
  _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
43
46
  target = torch.randint(V, (B, T), dtype=torch.long, device=device)
44
47
 
45
48
  def fwd():
46
49
  if provider == "liger":
47
- return liger_lm_head_cpo(_input, target)
50
+ return liger_fwd(_input, target)
48
51
  elif provider == "huggingface":
49
- return torch_lm_head_cpo(_input, target)
52
+ return torch_fwd(_input, target)
50
53
 
51
54
  def full():
52
55
  y = fwd()
@@ -79,17 +82,20 @@ def bench_speed_fused_linear_cpo_loss(
79
82
  provider = input.kernel_provider
80
83
  mode = input.kernel_operation_mode
81
84
 
82
- torch_lm_head_cpo = lambda x, target: TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
83
- liger_lm_head_cpo = lambda x, target: LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)(x, target)[0]
85
+ # Instantiate once and retrieve the first output only
86
+ torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
87
+ liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
88
+ torch_fwd = lambda x, target: torch_lm_head_cpo(x, target)[0]
89
+ liger_fwd = lambda x, target: liger_lm_head_cpo(x, target)[0]
84
90
 
85
91
  _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
86
92
  target = torch.randint(V, (B, T), dtype=torch.long, device=device)
87
93
 
88
94
  def fwd():
89
95
  if provider == "liger":
90
- return liger_lm_head_cpo(_input, target)
96
+ return liger_fwd(_input, target)
91
97
  elif provider == "huggingface":
92
- return torch_lm_head_cpo(_input, target)
98
+ return torch_fwd(_input, target)
93
99
 
94
100
  if mode == "forward":
95
101
  ms_50, ms_20, ms_80 = triton.testing.do_bench(
@@ -32,12 +32,11 @@ def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
32
32
  ignore_index = input.extra_benchmark_config["ignore_index"]
33
33
  provider = input.kernel_provider
34
34
 
35
- torch_dpo_loss = lambda x, ref_x, target: TorchLMHeadDPO(
36
- H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
37
- ).to(device)(x, ref_x, target)[0]
38
- liger_dpo_loss = lambda x, ref_x, target: LigerLMHeadDPO(
39
- H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
40
- ).to(device)(x, ref_x, target)[0]
35
+ # Instantiate once and retrieve the first output only
36
+ torch_dpo_loss = TorchLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
37
+ liger_dpo_loss = LigerLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
38
+ torch_fwd = lambda x, ref_x, target: torch_dpo_loss(x, ref_x, target)[0]
39
+ liger_fwd = lambda x, ref_x, target: liger_dpo_loss(x, ref_x, target)[0]
41
40
 
42
41
  # Input shape: [B, T, H]
43
42
  _input = torch.randn(B, T, H, device=device, dtype=dtype)
@@ -52,9 +51,9 @@ def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
52
51
 
53
52
  def fwd():
54
53
  if provider == "liger":
55
- return liger_dpo_loss(_input, ref_input, target)
54
+ return liger_fwd(_input, ref_input, target)
56
55
  elif provider == "huggingface":
57
- return torch_dpo_loss(_input, ref_input, target)
56
+ return torch_fwd(_input, ref_input, target)
58
57
 
59
58
  def full():
60
59
  y = fwd()
@@ -83,12 +82,11 @@ def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
83
82
  provider = input.kernel_provider
84
83
  mode = input.kernel_operation_mode
85
84
 
86
- torch_dpo_loss = lambda x, ref_x, target: TorchLMHeadDPO(
87
- H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
88
- ).to(device)(x, ref_x, target)[0]
89
- liger_dpo_loss = lambda x, ref_x, target: LigerLMHeadDPO(
90
- H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
91
- ).to(device)(x, ref_x, target)[0]
85
+ # Instantiate once and retrieve the first output only
86
+ torch_dpo_loss = TorchLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
87
+ liger_dpo_loss = LigerLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
88
+ torch_fwd = lambda x, ref_x, target: torch_dpo_loss(x, ref_x, target)[0]
89
+ liger_fwd = lambda x, ref_x, target: liger_dpo_loss(x, ref_x, target)[0]
92
90
 
93
91
  # Input shape: [B, T, H]
94
92
  _input = torch.randn(B, T, H, device=device, dtype=dtype)
@@ -103,9 +101,9 @@ def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
103
101
 
104
102
  def fwd():
105
103
  if provider == "liger":
106
- return liger_dpo_loss(_input, ref_input, target)
104
+ return liger_fwd(_input, ref_input, target)
107
105
  elif provider == "huggingface":
108
- return torch_dpo_loss(_input, ref_input, target)
106
+ return torch_fwd(_input, ref_input, target)
109
107
 
110
108
  if mode == "forward":
111
109
  ms_50, ms_20, ms_80 = triton.testing.do_bench(
@@ -48,6 +48,14 @@ def bench_speed_embedding(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
48
48
 
49
49
  if mode == "forward":
50
50
  ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, rep=100)
51
+ elif mode == "backward":
52
+ output = fwd()
53
+ ms_50, ms_20, ms_80 = triton.testing.do_bench(
54
+ lambda: output.backward(torch.randn_like(output), retain_graph=True),
55
+ quantiles=QUANTILES,
56
+ grad_to_none=[input_ids],
57
+ rep=100,
58
+ )
51
59
  elif mode == "full":
52
60
  ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, rep=100)
53
61
  return SingleBenchmarkRunOutput(