liger-kernel 0.6.0__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. liger_kernel-0.6.2/.github/workflows/benchmark.yml +168 -0
  2. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/PKG-INFO +12 -14
  3. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/README.md +11 -13
  4. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/data/all_benchmark_data.csv +176 -30
  5. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_cpo_loss.py +14 -8
  6. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_dpo_loss.py +14 -16
  7. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_embedding.py +8 -0
  8. liger_kernel-0.6.2/benchmark/scripts/benchmark_fused_add_rms_norm.py +201 -0
  9. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +11 -3
  10. liger_kernel-0.6.2/benchmark/scripts/benchmark_llama4_rope.py +249 -0
  11. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_orpo_loss.py +14 -8
  12. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_simpo_loss.py +14 -8
  13. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/dev/modal/benchmarks.py +1 -1
  14. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/index.md +8 -10
  15. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/pyproject.toml +1 -1
  16. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/dpo_loss.py +54 -3
  17. liger_kernel-0.6.2/src/liger_kernel/ops/fused_add_rms_norm.py +412 -0
  18. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/fused_linear_cross_entropy.py +21 -13
  19. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/layer_norm.py +126 -89
  20. liger_kernel-0.6.2/src/liger_kernel/ops/llama4_rope.py +225 -0
  21. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/rms_norm.py +2 -2
  22. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/rope.py +1 -1
  23. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/__init__.py +20 -0
  24. liger_kernel-0.6.2/src/liger_kernel/transformers/experimental/__init__.py +5 -0
  25. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/functional.py +7 -0
  26. liger_kernel-0.6.2/src/liger_kernel/transformers/fused_add_rms_norm.py +39 -0
  27. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +3 -0
  28. liger_kernel-0.6.2/src/liger_kernel/transformers/llama4_rope.py +93 -0
  29. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/gemma3.py +1 -1
  30. liger_kernel-0.6.2/src/liger_kernel/transformers/model/glm4v.py +150 -0
  31. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/loss_utils.py +2 -0
  32. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/mllama.py +4 -2
  33. liger_kernel-0.6.2/src/liger_kernel/transformers/model/phi3.py +112 -0
  34. liger_kernel-0.6.2/src/liger_kernel/transformers/model/smollm3.py +189 -0
  35. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/monkey_patch.py +185 -32
  36. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel.egg-info/PKG-INFO +12 -14
  37. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel.egg-info/SOURCES.txt +10 -0
  38. liger_kernel-0.6.2/test/chunked_loss/test_dpo_loss.py +938 -0
  39. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/bf16/test_mini_models.py +163 -2
  40. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/bf16/test_mini_models_multimodal.py +13 -3
  41. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/bf16/test_mini_models_with_logits.py +160 -0
  42. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/fp32/test_mini_models.py +155 -1
  43. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/fp32/test_mini_models_multimodal.py +2 -1
  44. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/fp32/test_mini_models_with_logits.py +155 -0
  45. liger_kernel-0.6.2/test/transformers/test_fused_add_rms_norm.py +219 -0
  46. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_fused_linear_cross_entropy.py +12 -5
  47. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_layer_norm.py +3 -0
  48. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_monkey_patch.py +148 -5
  49. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/utils.py +24 -0
  50. liger_kernel-0.6.0/.github/workflows/benchmark.yml +0 -93
  51. liger_kernel-0.6.0/src/liger_kernel/transformers/model/phi3.py +0 -263
  52. liger_kernel-0.6.0/test/chunked_loss/test_dpo_loss.py +0 -358
  53. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  54. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  55. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/pull_request_template.md +0 -0
  56. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/workflows/amd-ci.yml +0 -0
  57. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/workflows/docs.yml +0 -0
  58. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/workflows/intel-ci.yml +0 -0
  59. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/workflows/nvi-ci.yml +0 -0
  60. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/workflows/publish-nightly.yml +0 -0
  61. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.github/workflows/publish-release.yml +0 -0
  62. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/.gitignore +0 -0
  63. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/LICENSE +0 -0
  64. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/Makefile +0 -0
  65. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/NOTICE +0 -0
  66. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/README.md +0 -0
  67. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/__init__.py +0 -0
  68. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/benchmarks_visualizer.py +0 -0
  69. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/__init__.py +0 -0
  70. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  71. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  72. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  73. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_dyt.py +0 -0
  74. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  75. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  76. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_geglu.py +0 -0
  77. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_group_norm.py +0 -0
  78. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_jsd.py +0 -0
  79. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_kl_div.py +0 -0
  80. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  81. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  82. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  83. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  84. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  85. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_rope.py +0 -0
  86. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_softmax.py +0 -0
  87. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  88. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  89. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_swiglu.py +0 -0
  90. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/benchmark_tvd.py +0 -0
  91. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/benchmark/scripts/utils.py +0 -0
  92. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/dev/fmt-requirements.txt +0 -0
  93. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/dev/modal/tests.py +0 -0
  94. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/dev/modal/tests_bwd.py +0 -0
  95. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/Examples.md +0 -0
  96. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/Getting-Started.md +0 -0
  97. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/High-Level-APIs.md +0 -0
  98. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/Low-Level-APIs.md +0 -0
  99. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/acknowledgement.md +0 -0
  100. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/contributing.md +0 -0
  101. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/images/banner.GIF +0 -0
  102. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/images/compose.gif +0 -0
  103. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/images/e2e-memory.png +0 -0
  104. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/images/e2e-tps.png +0 -0
  105. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/images/logo-banner.png +0 -0
  106. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/images/patch.gif +0 -0
  107. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/images/post-training.png +0 -0
  108. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/docs/license.md +0 -0
  109. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/alignment/accelerate_config.yaml +0 -0
  110. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/alignment/run_orpo.py +0 -0
  111. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/README.md +0 -0
  112. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/callback.py +0 -0
  113. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/config/fsdp_config.json +0 -0
  114. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  115. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  116. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  117. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/img/llama_tps.png +0 -0
  118. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  119. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/img/qwen_tps.png +0 -0
  120. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/launch_on_modal.py +0 -0
  121. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/requirements.txt +0 -0
  122. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/run_benchmarks.sh +0 -0
  123. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/run_gemma.sh +0 -0
  124. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/run_llama.sh +0 -0
  125. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/run_qwen.sh +0 -0
  126. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/run_qwen2_vl.sh +0 -0
  127. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/training.py +0 -0
  128. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/huggingface/training_multimodal.py +0 -0
  129. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/lightning/README.md +0 -0
  130. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/lightning/requirements.txt +0 -0
  131. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/lightning/training.py +0 -0
  132. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/README.md +0 -0
  133. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/callback.py +0 -0
  134. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  135. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  136. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  137. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  138. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  139. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  140. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  141. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  142. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  143. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/medusa_util.py +0 -0
  144. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/requirements.txt +0 -0
  145. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  146. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/examples/medusa/train.py +0 -0
  147. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/licenses/LICENSE-Apache-2.0 +0 -0
  148. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  149. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  150. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/licenses/LICENSE-MIT-llmc +0 -0
  151. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/licenses/LICENSE-MIT-triton +0 -0
  152. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/mkdocs.yml +0 -0
  153. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/setup.cfg +0 -0
  154. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/setup.py +0 -0
  155. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/__init__.py +0 -0
  156. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/README.md +0 -0
  157. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  158. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
  159. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  160. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/functional.py +0 -0
  161. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  162. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  163. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  164. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  165. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  166. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  167. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  168. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  169. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  170. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/env_report.py +0 -0
  171. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/__init__.py +0 -0
  172. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/cross_entropy.py +0 -0
  173. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/dyt.py +0 -0
  174. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  175. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  176. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  177. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  178. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/geglu.py +0 -0
  179. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/group_norm.py +0 -0
  180. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/grpo_loss.py +0 -0
  181. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/jsd.py +0 -0
  182. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/kl_div.py +0 -0
  183. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  184. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  185. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/softmax.py +0 -0
  186. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/sparsemax.py +0 -0
  187. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/swiglu.py +0 -0
  188. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/tvd.py +0 -0
  189. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/ops/utils.py +0 -0
  190. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/auto_model.py +0 -0
  191. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  192. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/dyt.py +0 -0
  193. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  194. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/fsdp.py +0 -0
  195. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  196. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  197. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/geglu.py +0 -0
  198. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/group_norm.py +0 -0
  199. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  200. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/jsd.py +0 -0
  201. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/kl_div.py +0 -0
  202. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/layer_norm.py +0 -0
  203. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/__init__.py +0 -0
  204. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/gemma.py +0 -0
  205. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  206. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/glm4.py +0 -0
  207. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/llama.py +0 -0
  208. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/llama4.py +0 -0
  209. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/llava.py +0 -0
  210. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/mistral.py +0 -0
  211. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  212. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  213. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/paligemma.py +0 -0
  214. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  215. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  216. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  217. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  218. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  219. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  220. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  221. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/rms_norm.py +0 -0
  222. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/rope.py +0 -0
  223. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/softmax.py +0 -0
  224. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/sparsemax.py +0 -0
  225. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/swiglu.py +0 -0
  226. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  227. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  228. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  229. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/transformers/tvd.py +0 -0
  230. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/triton/__init__.py +0 -0
  231. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/triton/monkey_patch.py +0 -0
  232. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel/utils.py +0 -0
  233. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel.egg-info/dependency_links.txt +0 -0
  234. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel.egg-info/requires.txt +0 -0
  235. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/src/liger_kernel.egg-info/top_level.txt +0 -0
  236. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/__init__.py +0 -0
  237. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/chunked_loss/__init__.py +0 -0
  238. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/chunked_loss/test_cosine_loss.py +0 -0
  239. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/chunked_loss/test_cpo_loss.py +0 -0
  240. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/chunked_loss/test_grpo_loss.py +0 -0
  241. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/chunked_loss/test_jsd_loss.py +0 -0
  242. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/chunked_loss/test_kto_loss.py +0 -0
  243. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/chunked_loss/test_orpo_loss.py +0 -0
  244. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/chunked_loss/test_simpo_loss.py +0 -0
  245. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/conftest.py +0 -0
  246. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/__init__.py +0 -0
  247. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/bf16/__init__.py +0 -0
  248. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/convergence/fp32/__init__.py +0 -0
  249. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  250. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  251. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  252. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  253. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  254. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  255. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  256. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  257. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  258. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  259. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/tiny_shakespeare.txt +0 -0
  260. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  261. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  262. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  263. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_auto_model.py +0 -0
  264. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_cross_entropy.py +0 -0
  265. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_dyt.py +0 -0
  266. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_embedding.py +0 -0
  267. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_flex_attention.py +0 -0
  268. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_fused_linear_jsd.py +0 -0
  269. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  270. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_geglu.py +0 -0
  271. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_group_norm.py +0 -0
  272. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_grpo_loss.py +0 -0
  273. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_jsd.py +0 -0
  274. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_kl_div.py +0 -0
  275. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_mm_int8int2.py +0 -0
  276. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_multi_token_attention.py +0 -0
  277. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_qwen2vl_mrope.py +0 -0
  278. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_rms_norm.py +0 -0
  279. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_rope.py +0 -0
  280. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_softmax.py +0 -0
  281. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_sparsemax.py +0 -0
  282. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_swiglu.py +0 -0
  283. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_trainer_integration.py +0 -0
  284. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_transformers.py +0 -0
  285. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/transformers/test_tvd.py +0 -0
  286. {liger_kernel-0.6.0 → liger_kernel-0.6.2}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -0,0 +1,168 @@
1
+ name: Benchmarks
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ schedule:
7
+ # Runs at 00:00 UTC every Friday
8
+ - cron: '0 0 * * 5'
9
+ workflow_dispatch: # Enables manual trigger
10
+ inputs:
11
+ commit_hash:
12
+ description: 'Commit hash to benchmark'
13
+ default: 'main'
14
+ overwrite:
15
+ description: 'Overwrite existing benchmark data if it exists'
16
+ type: boolean
17
+ default: false
18
+
19
+ permissions:
20
+ contents: write
21
+
22
+ concurrency:
23
+ # This causes it to cancel previous in-progress actions on the same PR / branch.
24
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
25
+ cancel-in-progress: true
26
+
27
+ jobs:
28
+ benchmarks:
29
+ runs-on: ubuntu-latest
30
+ env:
31
+ MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
32
+ MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
33
+ GITHUB_USERNAME: linkedin
34
+ REPO_NAME: Liger-Kernel
35
+ OUTPUT_DIR: benchmarks
36
+ OUTPUT_FILENAME: benchmark.csv
37
+ GENERATED_CSV: benchmark/data/all_benchmark_data.csv
38
+
39
+
40
+ steps:
41
+ # Step: Decide the commit hash to use
42
+ # Step: Checkout full history so we can check out any commit
43
+ - name: Checkout full repo history
44
+ uses: actions/checkout@v3
45
+ with:
46
+ fetch-depth: 0 # Important: so we can checkout arbitrary commit
47
+
48
+ - name: Determine commit hash to checkout
49
+ id: choose_commit
50
+ run: |
51
+ if [ "${{ github.event_name}}" == "workflow_dispatch" ] && [ "${{ github.event.inputs.commit_hash }}" != "main" ]; then
52
+ echo "Using manual input commit: ${{ github.event.inputs.commit_hash }}"
53
+ echo "hash=${{ github.event.inputs.commit_hash }}" >> $GITHUB_OUTPUT
54
+ else
55
+ echo "Using latest commit from main"
56
+ echo "hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
57
+ fi
58
+
59
+ # Step: Conditionally replace benchmark folder from main
60
+ - name: Replace benchmark folder from main (manual only, commit ≠ main)
61
+ if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.commit_hash != 'main' }}
62
+ run: |
63
+ echo "Detected manual trigger with commit_hash = ${{ github.event.inputs.commit_hash }}"
64
+
65
+ # Save current branch (detached HEAD at old commit)
66
+ ORIG_COMMIT=${{ github.event.inputs.commit_hash }}
67
+
68
+ # Fetch and checkout main
69
+ git fetch origin main
70
+ git checkout origin/main -- benchmark/
71
+
72
+ # Save benchmark folder from main
73
+ cp -r benchmark /tmp/benchmark_main
74
+ # Checkout back to target commit
75
+ git checkout $ORIG_COMMIT
76
+ # Replace old benchmark with one from main
77
+ rm -rf benchmark
78
+ cp -r /tmp/benchmark_main benchmark
79
+
80
+ # Step: Check if benchmark exists and exit if overwrite is false
81
+ - name: Check existing benchmark
82
+ run: |
83
+ COMMIT_HASH="${{ steps.choose_commit.outputs.hash }}"
84
+ BENCHMARK_URL="https://raw.githubusercontent.com/linkedin/Liger-Kernel/refs/heads/gh-pages/benchmarks/${COMMIT_HASH}/benchmark.csv"
85
+
86
+ if curl --output /dev/null --silent --head --fail "$BENCHMARK_URL"; then
87
+ echo "Benchmark already exists for commit $COMMIT_HASH"
88
+ if [ "${{ github.event.inputs.overwrite }}" != "true" ]; then
89
+ echo "Overwrite is false - exiting"
90
+ exit 1
91
+ else
92
+ echo "Overwrite is true - proceeding"
93
+ fi
94
+ else
95
+ echo "No existing benchmark found - proceeding"
96
+ fi
97
+
98
+ - name: Set up Python
99
+ uses: actions/setup-python@v3
100
+ with:
101
+ python-version: '3.10'
102
+
103
+ # Install dependencies
104
+ - name: Install dependencies
105
+ run: |
106
+ python -m pip install --upgrade pip
107
+ pip install modal
108
+
109
+ # Delete previous benchmark results.
110
+ - name: Remove previous benchmark data
111
+ run: |
112
+ rm -f benchmark/data/all_benchmark_data.csv
113
+
114
+ - name: Run benchmarks on GPU
115
+ run: |
116
+ modal run dev.modal.benchmarks
117
+
118
+ # Step 5: Checkout gh-pages branch in a subfolder
119
+ - name: Checkout gh-pages
120
+ uses: actions/checkout@v3
121
+ with:
122
+ ref: gh-pages
123
+ path: gh-pages
124
+
125
+ # Step 6: Copy benchmark CSV to gh-pages directory
126
+ - name: Copy generated benchmark to gh-pages
127
+ id: copy_benchmark
128
+ run: |
129
+ if [[ "${{ github.event_name }}" == "release" ]]; then
130
+ echo "Release event detected"
131
+ path=${{steps.choose_commit.outputs.hash}}-${{ github.event.release.tag_name }}
132
+ else
133
+ echo "Not a release event"
134
+ path=${{steps.choose_commit.outputs.hash}}
135
+ fi
136
+ echo "path=$path" >> $GITHUB_OUTPUT
137
+ COMMIT_DIR="gh-pages/${OUTPUT_DIR}/${path}"
138
+
139
+ mkdir -p "$COMMIT_DIR"
140
+
141
+ if [ -f "$COMMIT_DIR/${OUTPUT_FILENAME}" ]; then
142
+ echo "Removing existing benchmark.csv for this commit"
143
+ rm "$COMMIT_DIR/${OUTPUT_FILENAME}"
144
+ fi
145
+
146
+ cp "${GENERATED_CSV}" "$COMMIT_DIR/${OUTPUT_FILENAME}"
147
+
148
+ # Step 7: Append commit hash to commits.txt if not already present
149
+ - name: Update commits.txt
150
+ run: |
151
+ cd gh-pages
152
+ echo "commits.txt file path: ${OUTPUT_DIR}/commits.txt"
153
+ # Create file if it doesn't exist
154
+ mkdir -p ${OUTPUT_DIR}
155
+ touch ${OUTPUT_DIR}/commits.txt
156
+
157
+ echo "${{ steps.copy_benchmark.outputs.path }}" >> ${OUTPUT_DIR}/commits.txt
158
+
159
+ echo "Added commit hash to commits.txt"
160
+ # Step 8: Commit and push
161
+ - name: Commit and push to gh-pages
162
+ run: |
163
+ cd gh-pages
164
+ git config user.name github-actions[bot]
165
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
166
+ git add .
167
+ git commit -m "Add benchmark for commit ${{ steps.copy_benchmark.outputs.path }}" || echo "No changes to commit"
168
+ git push origin gh-pages
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: liger_kernel
3
- Version: 0.6.0
3
+ Version: 0.6.2
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -84,7 +84,7 @@ Dynamic: requires-dist
84
84
  </td>
85
85
  <td style="padding: 10px;">
86
86
  <a href="https://discord.gg/gpumode">
87
- <img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
87
+ <img src="https://dcbadge.limes.pink/api/server/gpumode?style=flat" alt="Join Our Discord">
88
88
  </a>
89
89
  </td>
90
90
  </tr>
@@ -307,7 +307,7 @@ loss.backward()
307
307
  | Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
308
308
  | Qwen2.5-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_5_vl` | RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
309
309
  | Qwen3 | `liger_kernel.transformers.apply_liger_kernel_to_qwen3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
310
- | Qwen3 MoE | `liger_kernel_transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
310
+ | Qwen3 MoE | `liger_kernel.transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
311
311
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
312
312
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
313
313
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -400,7 +400,7 @@ loss.backward()
400
400
  </a>
401
401
  </div>
402
402
  <div style="display: block;">
403
- <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
403
+ <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml">
404
404
  <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?event=schedule" alt="Build">
405
405
  </a>
406
406
  </div>
@@ -414,21 +414,19 @@ loss.backward()
414
414
 
415
415
  - For issues, create a Github ticket in this repository
416
416
  - For open discussion, join [our discord channel on GPUMode](https://discord.com/channels/1189498204333543425/1275130785933951039)
417
- - For formal collaboration, send an email to yannchen@linkedin.com and hning@linkedin.com
417
+ - For formal collaboration, send an email to Yanning Chen(yannchen@linkedin.com) and Zhipeng Wang(zhipwang@linkedin.com)
418
418
 
419
419
  ## Cite this work
420
420
 
421
421
  Biblatex entry:
422
422
  ```bib
423
- @article{hsu2024ligerkernelefficienttriton,
424
- title={Liger Kernel: Efficient Triton Kernels for LLM Training},
425
- author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
426
- year={2024},
427
- eprint={2410.10989},
428
- archivePrefix={arXiv},
429
- primaryClass={cs.LG},
430
- url={https://arxiv.org/abs/2410.10989},
431
- journal={arXiv preprint arXiv:2410.10989},
423
+ @inproceedings{
424
+ hsu2025ligerkernel,
425
+ title={Liger-Kernel: Efficient Triton Kernels for {LLM} Training},
426
+ author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen and Zhipeng Wang},
427
+ booktitle={Championing Open-source DEvelopment in ML Workshop @ ICML25},
428
+ year={2025},
429
+ url={https://openreview.net/forum?id=36SjAIT42G}
432
430
  }
433
431
  ```
434
432
 
@@ -32,7 +32,7 @@
32
32
  </td>
33
33
  <td style="padding: 10px;">
34
34
  <a href="https://discord.gg/gpumode">
35
- <img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
35
+ <img src="https://dcbadge.limes.pink/api/server/gpumode?style=flat" alt="Join Our Discord">
36
36
  </a>
37
37
  </td>
38
38
  </tr>
@@ -255,7 +255,7 @@ loss.backward()
255
255
  | Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
256
256
  | Qwen2.5-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_5_vl` | RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
257
257
  | Qwen3 | `liger_kernel.transformers.apply_liger_kernel_to_qwen3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
258
- | Qwen3 MoE | `liger_kernel_transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
258
+ | Qwen3 MoE | `liger_kernel.transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
259
259
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
260
260
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
261
261
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -348,7 +348,7 @@ loss.backward()
348
348
  </a>
349
349
  </div>
350
350
  <div style="display: block;">
351
- <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
351
+ <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml">
352
352
  <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?event=schedule" alt="Build">
353
353
  </a>
354
354
  </div>
@@ -362,21 +362,19 @@ loss.backward()
362
362
 
363
363
  - For issues, create a Github ticket in this repository
364
364
  - For open discussion, join [our discord channel on GPUMode](https://discord.com/channels/1189498204333543425/1275130785933951039)
365
- - For formal collaboration, send an email to yannchen@linkedin.com and hning@linkedin.com
365
+ - For formal collaboration, send an email to Yanning Chen(yannchen@linkedin.com) and Zhipeng Wang(zhipwang@linkedin.com)
366
366
 
367
367
  ## Cite this work
368
368
 
369
369
  Biblatex entry:
370
370
  ```bib
371
- @article{hsu2024ligerkernelefficienttriton,
372
- title={Liger Kernel: Efficient Triton Kernels for LLM Training},
373
- author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
374
- year={2024},
375
- eprint={2410.10989},
376
- archivePrefix={arXiv},
377
- primaryClass={cs.LG},
378
- url={https://arxiv.org/abs/2410.10989},
379
- journal={arXiv preprint arXiv:2410.10989},
371
+ @inproceedings{
372
+ hsu2025ligerkernel,
373
+ title={Liger-Kernel: Efficient Triton Kernels for {LLM} Training},
374
+ author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen and Zhipeng Wang},
375
+ booktitle={Championing Open-source DEvelopment in ML Workshop @ ICML25},
376
+ year={2025},
377
+ url={https://openreview.net/forum?id=36SjAIT42G}
380
378
  }
381
379
  ```
382
380
 
@@ -625,36 +625,6 @@ group_norm,huggingface,backward,memory,MB,C,num_channels,256,320.5078125,320.507
625
625
  group_norm,huggingface,backward,memory,MB,C,num_channels,512,641.015625,641.015625,641.015625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
626
626
  group_norm,huggingface,backward,memory,MB,C,num_channels,1024,1282.03125,1282.03125,1282.03125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
627
627
  group_norm,huggingface,backward,memory,MB,C,num_channels,2048,2564.0625,2564.0625,2564.0625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
628
- layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.035840000957250595,0.03481600061058998,0.035840000957250595,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
629
- layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.05939200147986412,0.058368001133203506,0.060416001826524734,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
630
- layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.10751999914646149,0.10751999914646149,0.1085439994931221,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
631
- layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.20582400262355804,0.20479999482631683,0.20684799551963806,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
632
- layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.3993600010871887,0.3983359932899475,0.40140798687934875,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
633
- layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.03788800165057182,0.03788800165057182,0.03891199827194214,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
634
- layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.0655359998345375,0.0655359998345375,0.06656000018119812,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
635
- layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.14745600521564484,0.14643199741840363,0.14847999811172485,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
636
- layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.31334400177001953,0.3123199939727783,0.31436800956726074,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
637
- layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.6133760213851929,0.6123520135879517,0.6154239773750305,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
638
- layer_norm,liger,full,speed,ms,N,hidden size,1024,0.6860799789428711,0.6146048903465271,0.7049216032028198,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
639
- layer_norm,liger,full,speed,ms,N,hidden size,2048,0.6789119839668274,0.6737920045852661,0.6912000179290771,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
640
- layer_norm,liger,full,speed,ms,N,hidden size,4096,0.6686720252037048,0.6635519862174988,0.681984007358551,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
641
- layer_norm,liger,full,speed,ms,N,hidden size,8192,0.6789119839668274,0.5908480286598206,0.6932479739189148,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
642
- layer_norm,liger,full,speed,ms,N,hidden size,16384,6.071296215057373,5.331148624420166,6.08235502243042,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
643
- layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.13312000036239624,0.13209599256515503,0.13312000036239624,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
644
- layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.23244799673557281,0.2303999960422516,0.23347200453281403,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
645
- layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.5242879986763,0.5232639908790588,0.5263360142707825,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
646
- layer_norm,huggingface,full,speed,ms,N,hidden size,8192,1.0168319940567017,1.0147839784622192,1.018880009651184,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
647
- layer_norm,huggingface,full,speed,ms,N,hidden size,16384,1.994752049446106,1.9916800260543823,1.9967999458312988,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
648
- layer_norm,liger,full,memory,MB,N,hidden size,1024,80.90625,80.90625,80.90625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
649
- layer_norm,liger,full,memory,MB,N,hidden size,2048,161.78125,161.78125,161.78125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
650
- layer_norm,liger,full,memory,MB,N,hidden size,4096,323.53125,323.53125,323.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
651
- layer_norm,liger,full,memory,MB,N,hidden size,8192,647.03125,647.03125,647.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
652
- layer_norm,liger,full,memory,MB,N,hidden size,16384,1294.03125,1294.03125,1294.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
653
- layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
654
- layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
655
- layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
656
- layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
657
- layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
658
628
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,2,116.00621032714844,116.00621032714844,116.00621032714844,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
659
629
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,4,230.83609008789062,230.83609008789062,230.83609008789062,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
660
630
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,8,461.9543151855469,461.9543151855469,461.9543151855469,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
@@ -1493,3 +1463,179 @@ distill_cosine_loss,torch,full,memory,MB,BT,B x T,1024,7566.2822265625,7566.2822
1493
1463
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,2048,11590.3134765625,11590.3134765625,11590.3134765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1494
1464
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,4096,19654.375,19654.375,19654.375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1495
1465
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,8192,35782.5,35782.5,35782.5,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1466
+ layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.018848000094294548,0.018400000408291817,0.020102400332689285,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1467
+ layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.029152000322937965,0.02876799926161766,0.029823999851942062,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1468
+ layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.05104000121355057,0.05036799982190132,0.05177599936723709,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1469
+ layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.0947519987821579,0.09436800330877304,0.09507200121879578,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1470
+ layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.18476800620555878,0.18396799266338348,0.1852159947156906,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1471
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.023584000766277313,0.023423999547958374,0.023840000852942467,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1472
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.03734400123357773,0.03702399879693985,0.037811201065778746,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1473
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.06617599725723267,0.06560000032186508,0.06678400188684464,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1474
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.15267199277877808,0.15190400183200836,0.15347200632095337,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1475
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.3067840039730072,0.3046143889427185,0.3081152021884918,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1476
+ layer_norm,liger,backward,speed,ms,N,hidden size,1024,0.12006399780511856,0.11653760075569153,0.12467200309038162,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1477
+ layer_norm,liger,backward,speed,ms,N,hidden size,2048,0.1207360029220581,0.1176128014922142,0.1256511986255646,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1478
+ layer_norm,liger,backward,speed,ms,N,hidden size,4096,0.16630400717258453,0.16412800550460815,0.16838400065898895,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1479
+ layer_norm,liger,backward,speed,ms,N,hidden size,8192,0.31279999017715454,0.31116798520088196,0.3145279884338379,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1480
+ layer_norm,liger,backward,speed,ms,N,hidden size,16384,0.5776320099830627,0.5753471970558167,0.5798912048339844,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1481
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,1024,0.0605119988322258,0.059647999703884125,0.061344001442193985,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1482
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,2048,0.09967999905347824,0.09849599748849869,0.10099200159311295,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1483
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,4096,0.17881600558757782,0.17795200645923615,0.17971199750900269,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1484
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,8192,0.33369600772857666,0.3328000009059906,0.33478400111198425,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1485
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,16384,0.6424000263214111,0.6412223815917969,0.643455982208252,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1486
+ layer_norm,liger,full,speed,ms,N,hidden size,1024,0.26576000452041626,0.2629248082637787,0.2701759934425354,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1487
+ layer_norm,liger,full,speed,ms,N,hidden size,2048,0.27427199482917786,0.26999040842056277,0.28091518878936766,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1488
+ layer_norm,liger,full,speed,ms,N,hidden size,4096,0.27454400062561035,0.27004799246788025,0.2807359993457794,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1489
+ layer_norm,liger,full,speed,ms,N,hidden size,8192,0.40556800365448,0.40403199195861816,0.40723198652267456,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1490
+ layer_norm,liger,full,speed,ms,N,hidden size,16384,0.7608960270881653,0.7589311957359314,0.7631679773330688,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1491
+ layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.08025600016117096,0.07942400127649307,0.08111999928951263,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1492
+ layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.13315199315547943,0.13180799782276154,0.13468800485134125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1493
+ layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.2417600005865097,0.24089600145816803,0.24262399971485138,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1494
+ layer_norm,huggingface,full,speed,ms,N,hidden size,8192,0.4832639992237091,0.48214399814605713,0.4843647956848145,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1495
+ layer_norm,huggingface,full,speed,ms,N,hidden size,16384,0.950575977563858,0.9484800100326538,0.9528064012527466,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1496
+ layer_norm,liger,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1497
+ layer_norm,liger,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1498
+ layer_norm,liger,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1499
+ layer_norm,liger,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1500
+ layer_norm,liger,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1501
+ layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1502
+ layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1503
+ layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1504
+ layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1505
+ layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1506
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,1024,0.01759999990463257,0.017311999574303627,0.017920000478625298,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1507
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,2048,0.02924799919128418,0.028863999992609024,0.029983999207615852,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1508
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,4096,0.05129599943757057,0.050624001771211624,0.05209600180387497,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1509
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,8192,0.09344000369310379,0.09296000003814697,0.09382399916648865,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1510
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,16384,0.1791680008172989,0.17814399302005768,0.1796800047159195,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1511
+ fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,32768,0.43830400705337524,0.43744000792503357,0.43929600715637207,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1512
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,1024,0.060095999389886856,0.059808000922203064,0.06054399907588959,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1513
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,2048,0.09084799885749817,0.09027200192213058,0.09161599725484848,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1514
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,4096,0.17820799350738525,0.17744000256061554,0.17897599935531616,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1515
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,8192,0.312608003616333,0.3118720054626465,0.31324800848960876,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1516
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,16384,0.574944019317627,0.5740479826927185,0.5756288051605225,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1517
+ fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,32768,1.0943039655685425,1.0934272289276123,1.0951999425888062,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
1518
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,1024,0.0352960005402565,0.03481600061058998,0.03811199963092804,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1519
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,2048,0.05430399999022484,0.05392000079154968,0.05503999814391136,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1520
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,4096,0.10592000186443329,0.1054655984044075,0.10630399733781815,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1521
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,8192,0.19679999351501465,0.19631999731063843,0.19724799692630768,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1522
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,16384,0.37436801195144653,0.3733760118484497,0.3752320110797882,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1523
+ fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,32768,0.7376000285148621,0.7361343741416931,0.7391359806060791,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
1524
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,1024,0.3147200047969818,0.30796160697937014,0.32764801383018494,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1525
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,2048,0.3089919984340668,0.30374398827552795,0.3226880133152008,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1526
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,4096,0.30691200494766235,0.3023296058177948,0.3205504059791565,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1527
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,8192,0.3246079981327057,0.3185984075069428,0.33656961321830753,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1528
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,16384,0.6010559797286987,0.5996800065040588,0.6026239991188049,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1529
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,32768,1.8402559757232666,1.8322880268096924,1.8461120128631592,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
1530
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,1024,0.23878400027751923,0.23545600473880768,0.2507520020008087,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1531
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,2048,0.34513600170612335,0.34377598762512207,0.34678399562835693,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1532
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,4096,0.6330879926681519,0.631712019443512,0.6345599889755249,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1533
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,8192,1.1185599565505981,1.1172800064086914,1.1196800470352173,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1534
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,16384,2.0697600841522217,2.0678528785705566,2.0713536739349365,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1535
+ fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,32768,3.9561920166015625,3.953824043273926,3.9581120014190674,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
1536
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,1024,0.38916800916194916,0.3824320137500763,0.4037184059619903,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1537
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,2048,0.3890720009803772,0.38193280100822447,0.4032831907272339,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1538
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,4096,0.39715200662612915,0.3928639888763428,0.41097599267959595,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1539
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,8192,0.6275200247764587,0.6259520053863525,0.6287999749183655,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1540
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,16384,1.202239990234375,1.199679970741272,1.2048959732055664,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1541
+ fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,32768,2.7738559246063232,2.7705343723297116,2.777868890762329,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
1542
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,1024,0.15619200468063354,0.15376000106334686,0.1661248028278351,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1543
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,2048,0.15825600177049637,0.15600000321865082,0.16911999881267548,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1544
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,4096,0.16700799763202667,0.16502399742603302,0.1709440052509308,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1545
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,8192,0.1712000072002411,0.1700800061225891,0.17215999960899353,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1546
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,16384,0.42505601048469543,0.4233280122280121,0.42691200971603394,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1547
+ fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,32768,1.4057759642601013,1.3944000005722046,1.4099839925765991,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
1548
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,1024,0.1520960032939911,0.15136000514030457,0.1528960019350052,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1549
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,2048,0.2533760070800781,0.2524160146713257,0.25436800718307495,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1550
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,4096,0.4551039934158325,0.4540799856185913,0.45612800121307373,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1551
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,8192,0.8053439855575562,0.8038079738616943,0.806656002998352,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1552
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,16384,1.4933120012283325,1.492095947265625,1.49452805519104,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1553
+ fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,32768,2.8600640296936035,2.8583295822143557,2.8612607955932616,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
1554
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,1024,0.20175999402999878,0.199072003364563,0.2154303938150406,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1555
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,2048,0.20263999700546265,0.20000000298023224,0.21675519943237304,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1556
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,4096,0.25276800990104675,0.2515519857406616,0.2539199888706207,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1557
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,8192,0.4322720021009445,0.43088001012802124,0.4336000084877014,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1558
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,16384,0.8288000226020813,0.8266303777694701,0.8311295866966247,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1559
+ fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,32768,2.03987193107605,2.0360767364501955,2.0436416149139403,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1560
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,1024,72.546875,72.546875,72.546875,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1561
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,2048,145.0859375,145.0859375,145.0859375,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1562
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,4096,290.1640625,290.1640625,290.1640625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1563
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,8192,580.3203125,580.3203125,580.3203125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1564
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,16384,1160.6328125,1160.6328125,1160.6328125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1565
+ fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,32768,2321.2578125,2321.2578125,2321.2578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1566
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,1024,104.03173828125,104.03173828125,104.03173828125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1567
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,2048,208.05517578125,208.05517578125,208.05517578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1568
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,4096,416.10205078125,416.10205078125,416.10205078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1569
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,8192,832.19580078125,832.19580078125,832.19580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1570
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,16384,1664.3125,1664.3125,1664.3125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1571
+ fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,32768,3328.625,3328.625,3328.625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1572
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,1024,104.03564453125,104.03564453125,104.03564453125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1573
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,2048,208.06298828125,208.06298828125,208.06298828125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1574
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578125,416.11767578125,416.11767578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1575
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1576
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1577
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1578
+ llama4_rope,liger,forward,speed,ms,H,hidden size,512,0.08249600231647491,0.08102399855852127,0.08432000130414963,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
1579
+ llama4_rope,liger,forward,speed,ms,H,hidden size,2048,0.08169600367546082,0.08037760108709335,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
1580
+ llama4_rope,liger,forward,speed,ms,H,hidden size,8192,0.08128000050783157,0.07980799674987793,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
1581
+ llama4_rope,huggingface,forward,speed,ms,H,hidden size,512,0.03759999945759773,0.03612799942493439,0.03907199949026108,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
1582
+ llama4_rope,huggingface,forward,speed,ms,H,hidden size,2048,0.06185600161552429,0.061267200857400894,0.06252799928188324,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
1583
+ llama4_rope,huggingface,forward,speed,ms,H,hidden size,8192,0.206496000289917,0.20582400262355804,0.20716799795627594,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
1584
+ llama4_rope,liger,backward,speed,ms,H,hidden size,512,0.15404799580574036,0.15241600573062897,0.15615999698638916,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
1585
+ llama4_rope,liger,backward,speed,ms,H,hidden size,2048,0.1536320000886917,0.15190400183200836,0.1558080017566681,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
1586
+ llama4_rope,liger,backward,speed,ms,H,hidden size,8192,0.15263999998569489,0.15094399452209473,0.15491199493408203,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
1587
+ llama4_rope,huggingface,backward,speed,ms,H,hidden size,512,0.13760000467300415,0.13574400544166565,0.14009599387645721,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
1588
+ llama4_rope,huggingface,backward,speed,ms,H,hidden size,2048,0.13600000739097595,0.13449600338935852,0.1382720023393631,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
1589
+ llama4_rope,huggingface,backward,speed,ms,H,hidden size,8192,0.21011200547218323,0.20924800634384155,0.21110400557518005,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
1590
+ llama4_rope,liger,full,speed,ms,H,hidden size,512,0.3652159869670868,0.3619840145111084,0.3699840009212494,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
1591
+ llama4_rope,liger,full,speed,ms,H,hidden size,2048,0.3599040061235428,0.2881920039653778,0.36559998989105225,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
1592
+ llama4_rope,liger,full,speed,ms,H,hidden size,8192,0.2874239981174469,0.2852480113506317,0.29029120206832887,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
1593
+ llama4_rope,huggingface,full,speed,ms,H,hidden size,512,0.24691200256347656,0.24489599466323853,0.24961919784545897,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1594
+ llama4_rope,huggingface,full,speed,ms,H,hidden size,2048,0.24774399399757385,0.24582399427890778,0.2505407989025116,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1595
+ llama4_rope,huggingface,full,speed,ms,H,hidden size,8192,0.41414400935173035,0.41337600350379944,0.41491198539733887,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1596
+ llama4_rope,liger,full,memory,MB,H,hidden size,512,37.23486328125,37.23486328125,37.23486328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1597
+ llama4_rope,liger,full,memory,MB,H,hidden size,2048,52.89111328125,52.89111328125,52.89111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1598
+ llama4_rope,liger,full,memory,MB,H,hidden size,8192,115.51611328125,115.51611328125,115.51611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1599
+ llama4_rope,huggingface,full,memory,MB,H,hidden size,512,49.64111328125,49.64111328125,49.64111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1600
+ llama4_rope,huggingface,full,memory,MB,H,hidden size,2048,102.51611328125,102.51611328125,102.51611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1601
+ llama4_rope,huggingface,full,memory,MB,H,hidden size,8192,314.01611328125,314.01611328125,314.01611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
1602
+ llama4_rope,liger,forward,speed,ms,T,sequence length,1024,0.07417599856853485,0.07248000055551529,0.07596799731254578,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1603
+ llama4_rope,liger,forward,speed,ms,T,sequence length,2048,0.08182399719953537,0.08006399869918823,0.08380799740552902,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1604
+ llama4_rope,liger,forward,speed,ms,T,sequence length,4096,0.11708799749612808,0.1167680025100708,0.11744000017642975,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1605
+ llama4_rope,liger,forward,speed,ms,T,sequence length,8192,0.2165440022945404,0.21596799790859222,0.21715199947357178,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1606
+ llama4_rope,liger,forward,speed,ms,T,sequence length,16384,0.41756799817085266,0.41705599427223206,0.41811200976371765,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
1607
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,1024,0.11644800007343292,0.11590400338172913,0.11708799749612808,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1608
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,2048,0.20659199357032776,0.20608000457286835,0.2072640061378479,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1609
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,4096,0.38553598523139954,0.3846847891807556,0.38624000549316406,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1610
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,8192,0.7411519885063171,0.7403839826583862,0.7420480251312256,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1611
+ llama4_rope,huggingface,forward,speed,ms,T,sequence length,16384,1.4553920030593872,1.4543871641159059,1.4562879800796509,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
1612
+ llama4_rope,liger,backward,speed,ms,T,sequence length,1024,0.11840000003576279,0.11711999773979187,0.12031999975442886,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1613
+ llama4_rope,liger,backward,speed,ms,T,sequence length,2048,0.12336000055074692,0.12198399752378464,0.12489599734544754,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1614
+ llama4_rope,liger,backward,speed,ms,T,sequence length,4096,0.12380799651145935,0.12240000069141388,0.12559999525547028,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1615
+ llama4_rope,liger,backward,speed,ms,T,sequence length,8192,0.2170879989862442,0.2165759950876236,0.21753600239753723,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1616
+ llama4_rope,liger,backward,speed,ms,T,sequence length,16384,0.4175359904766083,0.41705599427223206,0.4181375920772552,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
1617
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,1024,0.1189119964838028,0.11769600212574005,0.12003199756145477,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1618
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,2048,0.21011200547218323,0.20927999913692474,0.21119999885559082,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1619
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,4096,0.39740800857543945,0.3963199853897095,0.39824000000953674,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1620
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,8192,0.7540159821510315,0.7528960108757019,0.7550719976425171,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1621
+ llama4_rope,huggingface,backward,speed,ms,T,sequence length,16384,1.4822720289230347,1.4810559749603271,1.4833600521087646,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
1622
+ llama4_rope,liger,full,speed,ms,T,sequence length,1024,0.2874400019645691,0.2853440046310425,0.29052799940109253,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1623
+ llama4_rope,liger,full,speed,ms,T,sequence length,2048,0.28646400570869446,0.2845759987831116,0.28963199257850647,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1624
+ llama4_rope,liger,full,speed,ms,T,sequence length,4096,0.29897600412368774,0.29660800099372864,0.302131199836731,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1625
+ llama4_rope,liger,full,speed,ms,T,sequence length,8192,0.4315840005874634,0.4304639995098114,0.43270400166511536,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1626
+ llama4_rope,liger,full,speed,ms,T,sequence length,16384,0.833184003829956,0.8322240114212036,0.8345024228096007,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
1627
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,1024,0.24592000246047974,0.24396799504756927,0.24876800179481506,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1628
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,2048,0.4138239920139313,0.41308799386024475,0.4145599901676178,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1629
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,4096,0.7800959944725037,0.7790719866752625,0.7810239791870117,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1630
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,8192,1.4911680221557617,1.4902976036071778,1.4922879934310913,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1631
+ llama4_rope,huggingface,full,speed,ms,T,sequence length,16384,2.9344160556793213,2.9333438873291016,2.9353599548339844,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1632
+ llama4_rope,liger,full,memory,MB,T,sequence length,1024,73.75830078125,73.75830078125,73.75830078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1633
+ llama4_rope,liger,full,memory,MB,T,sequence length,2048,115.51611328125,115.51611328125,115.51611328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1634
+ llama4_rope,liger,full,memory,MB,T,sequence length,4096,199.03173828125,199.03173828125,199.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1635
+ llama4_rope,liger,full,memory,MB,T,sequence length,8192,366.06298828125,366.06298828125,366.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1636
+ llama4_rope,liger,full,memory,MB,T,sequence length,16384,700.12548828125,700.12548828125,700.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1637
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,1024,173.00830078125,173.00830078125,173.00830078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1638
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,2048,314.01611328125,314.01611328125,314.01611328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1639
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,4096,596.03173828125,596.03173828125,596.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1640
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,8192,1160.06298828125,1160.06298828125,1160.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1641
+ llama4_rope,huggingface,full,memory,MB,T,sequence length,16384,2288.12548828125,2288.12548828125,2288.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1