liger-kernel 0.5.4__tar.gz → 0.5.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/workflows/amd-ci.yml +5 -1
  2. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/PKG-INFO +11 -7
  3. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/README.md +8 -5
  4. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/benchmarks_visualizer.py +2 -2
  5. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/data/all_benchmark_data.csv +30 -31
  6. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_distill_jsd_loss.py +2 -0
  7. liger_kernel-0.5.6/benchmark/scripts/benchmark_dyt.py +139 -0
  8. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_kto_loss.py +4 -4
  9. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/dev/modal/tests.py +1 -1
  10. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/dev/modal/tests_bwd.py +1 -1
  11. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/pyproject.toml +1 -1
  12. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/cpo_loss.py +51 -11
  13. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/dpo_loss.py +30 -4
  14. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/functional.py +2 -0
  15. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +20 -5
  16. liger_kernel-0.5.6/src/liger_kernel/chunked_loss/fused_linear_ppo.py +331 -0
  17. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/fused_linear_preference.py +2 -2
  18. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +112 -17
  19. liger_kernel-0.5.6/src/liger_kernel/chunked_loss/grpo_loss.py +236 -0
  20. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/jsd_loss.py +43 -13
  21. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/kto_loss.py +50 -12
  22. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/orpo_loss.py +37 -5
  23. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/simpo_loss.py +47 -11
  24. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/cross_entropy.py +7 -2
  25. liger_kernel-0.5.6/src/liger_kernel/ops/dyt.py +225 -0
  26. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/fused_linear_jsd.py +2 -1
  27. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/jsd.py +30 -11
  28. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/__init__.py +4 -0
  29. liger_kernel-0.5.6/src/liger_kernel/transformers/dyt.py +20 -0
  30. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/functional.py +5 -0
  31. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/gemma.py +8 -16
  32. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/gemma2.py +7 -16
  33. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/llama.py +8 -15
  34. liger_kernel-0.5.6/src/liger_kernel/transformers/model/llava.py +369 -0
  35. liger_kernel-0.5.6/src/liger_kernel/transformers/model/loss_utils.py +57 -0
  36. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/mistral.py +9 -10
  37. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/mixtral.py +8 -15
  38. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/mllama.py +8 -15
  39. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/olmo2.py +8 -16
  40. liger_kernel-0.5.6/src/liger_kernel/transformers/model/paligemma.py +397 -0
  41. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/phi3.py +8 -15
  42. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/qwen2.py +8 -15
  43. liger_kernel-0.5.6/src/liger_kernel/transformers/model/qwen2_5_vl.py +204 -0
  44. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/qwen2_vl.py +9 -10
  45. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/monkey_patch.py +286 -12
  46. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/utils.py +1 -3
  47. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel.egg-info/PKG-INFO +11 -7
  48. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel.egg-info/SOURCES.txt +14 -1
  49. liger_kernel-0.5.6/test/chunked_loss/test_grpo_loss.py +470 -0
  50. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/chunked_loss/test_jsd_loss.py +64 -20
  51. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/chunked_loss/test_kto_loss.py +85 -8
  52. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/chunked_loss/test_orpo_loss.py +6 -0
  53. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/convergence/bf16/test_mini_models.py +179 -0
  54. liger_kernel-0.5.6/test/convergence/bf16/test_mini_models_multimodal.py +835 -0
  55. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/convergence/bf16/test_mini_models_with_logits.py +202 -22
  56. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/convergence/fp32/test_mini_models.py +173 -0
  57. liger_kernel-0.5.6/test/convergence/fp32/test_mini_models_multimodal.py +820 -0
  58. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/convergence/fp32/test_mini_models_with_logits.py +176 -1
  59. liger_kernel-0.5.6/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +61 -0
  60. liger_kernel-0.5.6/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +28 -0
  61. liger_kernel-0.5.6/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +7 -0
  62. liger_kernel-0.5.6/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +66 -0
  63. liger_kernel-0.5.6/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +63 -0
  64. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_cross_entropy.py +39 -0
  65. liger_kernel-0.5.6/test/transformers/test_dyt.py +136 -0
  66. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_jsd.py +5 -5
  67. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_monkey_patch.py +68 -0
  68. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/utils.py +70 -3
  69. liger_kernel-0.5.4/src/liger_kernel/chunked_loss/fused_linear_rlhf.py +0 -213
  70. liger_kernel-0.5.4/src/liger_kernel/chunked_loss/grpo_loss.py +0 -160
  71. liger_kernel-0.5.4/test/chunked_loss/test_grpo_loss.py +0 -275
  72. liger_kernel-0.5.4/test/convergence/bf16/test_mini_models_multimodal.py +0 -421
  73. liger_kernel-0.5.4/test/convergence/fp32/test_mini_models_multimodal.py +0 -415
  74. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  75. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  76. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/pull_request_template.md +0 -0
  77. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/workflows/docs.yml +0 -0
  78. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/workflows/intel-ci.yml +0 -0
  79. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/workflows/nvi-ci.yml +0 -0
  80. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/workflows/publish-nightly.yml +0 -0
  81. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.github/workflows/publish-release.yml +0 -0
  82. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/.gitignore +0 -0
  83. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/LICENSE +0 -0
  84. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/Makefile +0 -0
  85. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/NOTICE +0 -0
  86. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/README.md +0 -0
  87. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/__init__.py +0 -0
  88. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/__init__.py +0 -0
  89. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  90. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  91. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  92. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_embedding.py +0 -0
  93. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  94. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  95. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_geglu.py +0 -0
  96. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_group_norm.py +0 -0
  97. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_jsd.py +0 -0
  98. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_kl_div.py +0 -0
  99. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  100. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  101. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  102. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  103. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_rope.py +0 -0
  104. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  105. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_swiglu.py +0 -0
  106. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/benchmark_tvd.py +0 -0
  107. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/benchmark/scripts/utils.py +0 -0
  108. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/dev/fmt-requirements.txt +0 -0
  109. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/Examples.md +0 -0
  110. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/Getting-Started.md +0 -0
  111. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/High-Level-APIs.md +0 -0
  112. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/Low-Level-APIs.md +0 -0
  113. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/acknowledgement.md +0 -0
  114. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/contributing.md +0 -0
  115. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/images/banner.GIF +0 -0
  116. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/images/compose.gif +0 -0
  117. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/images/e2e-memory.png +0 -0
  118. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/images/e2e-tps.png +0 -0
  119. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/images/logo-banner.png +0 -0
  120. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/images/patch.gif +0 -0
  121. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/images/post-training.png +0 -0
  122. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/index.md +0 -0
  123. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/docs/license.md +0 -0
  124. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/alignment/accelerate_config.yaml +0 -0
  125. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/alignment/run_orpo.py +0 -0
  126. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/README.md +0 -0
  127. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/callback.py +0 -0
  128. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/config/fsdp_config.json +0 -0
  129. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  130. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  131. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  132. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/img/llama_tps.png +0 -0
  133. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  134. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/img/qwen_tps.png +0 -0
  135. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/launch_on_modal.py +0 -0
  136. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/requirements.txt +0 -0
  137. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/run_benchmarks.sh +0 -0
  138. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/run_gemma.sh +0 -0
  139. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/run_llama.sh +0 -0
  140. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/run_qwen.sh +0 -0
  141. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/run_qwen2_vl.sh +0 -0
  142. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/training.py +0 -0
  143. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/huggingface/training_multimodal.py +0 -0
  144. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/lightning/README.md +0 -0
  145. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/lightning/requirements.txt +0 -0
  146. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/lightning/training.py +0 -0
  147. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/README.md +0 -0
  148. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/callback.py +0 -0
  149. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  150. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  151. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  152. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  153. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  154. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  155. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  156. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  157. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  158. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/medusa_util.py +0 -0
  159. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/requirements.txt +0 -0
  160. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  161. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/examples/medusa/train.py +0 -0
  162. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/licenses/LICENSE-Apache-2.0 +0 -0
  163. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  164. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  165. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/licenses/LICENSE-MIT-llmc +0 -0
  166. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/licenses/LICENSE-MIT-triton +0 -0
  167. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/mkdocs.yml +0 -0
  168. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/setup.cfg +0 -0
  169. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/setup.py +0 -0
  170. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/__init__.py +0 -0
  171. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/README.md +0 -0
  172. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  173. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/env_report.py +0 -0
  174. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/__init__.py +0 -0
  175. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  176. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  177. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  178. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/geglu.py +0 -0
  179. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/group_norm.py +0 -0
  180. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/kl_div.py +2 -2
  181. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/layer_norm.py +0 -0
  182. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  183. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/rms_norm.py +0 -0
  184. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/rope.py +0 -0
  185. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/swiglu.py +0 -0
  186. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/tvd.py +0 -0
  187. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/ops/utils.py +0 -0
  188. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/auto_model.py +0 -0
  189. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  190. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  191. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  192. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  193. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/geglu.py +0 -0
  194. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/group_norm.py +0 -0
  195. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/jsd.py +0 -0
  196. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/kl_div.py +0 -0
  197. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/layer_norm.py +0 -0
  198. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/model/__init__.py +0 -0
  199. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  200. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/rms_norm.py +0 -0
  201. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/rope.py +0 -0
  202. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/swiglu.py +0 -0
  203. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  204. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  205. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  206. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/transformers/tvd.py +0 -0
  207. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/triton/__init__.py +0 -0
  208. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel/triton/monkey_patch.py +0 -0
  209. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel.egg-info/dependency_links.txt +0 -0
  210. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel.egg-info/requires.txt +0 -0
  211. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/src/liger_kernel.egg-info/top_level.txt +0 -0
  212. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/__init__.py +0 -0
  213. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/chunked_loss/__init__.py +0 -0
  214. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/chunked_loss/test_cpo_loss.py +0 -0
  215. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/chunked_loss/test_dpo_loss.py +0 -0
  216. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/chunked_loss/test_simpo_loss.py +0 -0
  217. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/conftest.py +0 -0
  218. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/convergence/__init__.py +0 -0
  219. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/convergence/bf16/__init__.py +0 -0
  220. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/convergence/fp32/__init__.py +0 -0
  221. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  222. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  223. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  224. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/resources/tiny_shakespeare.txt +0 -0
  225. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  226. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  227. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  228. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_auto_model.py +0 -0
  229. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_embedding.py +0 -0
  230. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_flex_attention.py +0 -0
  231. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  232. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_fused_linear_jsd.py +0 -0
  233. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_geglu.py +0 -0
  234. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_group_norm.py +0 -0
  235. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_kl_div.py +0 -0
  236. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_layer_norm.py +0 -0
  237. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_mm_int8int2.py +0 -0
  238. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_qwen2vl_mrope.py +0 -0
  239. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_rms_norm.py +0 -0
  240. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_rope.py +0 -0
  241. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_swiglu.py +0 -0
  242. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_trainer_integration.py +0 -0
  243. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_transformers.py +0 -0
  244. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/transformers/test_tvd.py +0 -0
  245. {liger_kernel-0.5.4 → liger_kernel-0.5.6}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -47,6 +47,9 @@ jobs:
  tests:
  runs-on: linux-mi300-gpu-1
  needs: [checkstyle]
+ strategy:
+ matrix:
+ rocm_version: ['6.3']

  steps:
  - name: Checkout code
@@ -59,8 +62,9 @@ jobs:

  - name: Setup Dependencies
  run: |
+ rocm-smi
  python -m pip install --upgrade pip
- pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
+ pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm_version }}

  - name: List Python Environments
  run: python -m pip list
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: liger_kernel
- Version: 0.5.4
+ Version: 0.5.6
  Summary: Efficient Triton kernels for LLM Training
  License: BSD 2-CLAUSE LICENSE
  Copyright 2024 LinkedIn Corporation
@@ -45,6 +45,7 @@ Requires-Dist: datasets>=2.19.2; extra == "dev"
  Requires-Dist: seaborn; extra == "dev"
  Requires-Dist: mkdocs; extra == "dev"
  Requires-Dist: mkdocs-material; extra == "dev"
+ Dynamic: license-file
  Dynamic: provides-extra
  Dynamic: requires-dist

@@ -115,6 +116,7 @@ Dynamic: requires-dist
  <details>
  <summary>Latest News 🔥</summary>

+ - [2025/03/06] We release a joint blog post on TorchTune × Liger - [Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel](https://pytorch.org/blog/peak-performance-minimized-memory/)
  - [2024/12/11] We release [v0.5.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.5.0): 80% more memory efficient post training losses (DPO, ORPO, CPO, etc)!
  - [2024/12/5] We release LinkedIn Engineering Blog - [Liger-Kernel: Empowering an open source ecosystem of Triton Kernels for Efficient LLM Training](https://www.linkedin.com/blog/engineering/open-source/liger-kernel-open-source-ecosystem-for-efficient-llm-training)
  - [2024/11/6] We release [v0.4.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.4.0): Full AMD support, Tech Report, Modal CI, Llama-3.2-Vision!
@@ -154,7 +156,7 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and
  We provide optimized post training kernels like DPO, ORPO, SimPO, and more which can reduce memory usage by up to 80%. You can easily use them as python modules.

  ```python
- from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss
+ from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss
  orpo_loss = LigerFusedLinearORPOLoss()
  y = orpo_loss(lm_head.weight, x, target)
  ```
@@ -177,7 +179,7 @@ y = orpo_loss(lm_head.weight, x, target)
  - **Exact:** Computation is exact—no approximations! Both forward and backward passes are implemented with rigorous unit tests and undergo convergence testing against training runs without Liger Kernel to ensure accuracy.
  - **Lightweight:** Liger Kernel has minimal dependencies, requiring only Torch and Triton—no extra libraries needed! Say goodbye to dependency headaches!
  - **Multi-GPU supported:** Compatible with multi-GPU setups (PyTorch FSDP, DeepSpeed, DDP, etc.).
- - **Trainer Framework Integration**: [Axolotl](https://github.com/axolotl-ai-cloud/axolotl), [LLaMa-Factory](https://github.com/hiyouga/LLaMA-Factory), [SFTTrainer](https://github.com/huggingface/trl/releases/tag/v0.10.1), [Hugging Face Trainer](https://github.com/huggingface/transformers/pull/32860), [SWIFT](https://github.com/modelscope/ms-swift)
+ - **Trainer Framework Integration**: [Axolotl](https://github.com/axolotl-ai-cloud/axolotl), [LLaMa-Factory](https://github.com/hiyouga/LLaMA-Factory), [SFTTrainer](https://github.com/huggingface/trl/releases/tag/v0.10.1), [Hugging Face Trainer](https://github.com/huggingface/transformers/pull/32860), [SWIFT](https://github.com/modelscope/ms-swift), [oumi](https://github.com/oumi-ai/oumi/tree/main)

  ## Installation

@@ -312,8 +314,10 @@ loss.backward()
  | Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+ | Paligemma, Paligemma2, & Paligemma2 Mix | `liger_kernel.transformers.apply_liger_kernel_to_paligemma` | LayerNorm, RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Qwen2, Qwen2.5, & QwQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+ | Qwen2.5-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_5_vl` | RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -385,8 +389,8 @@ loss.backward()
  ## Contact

  - For issues, create a Github ticket in this repository
- - For open discussion, join [our discord channel](https://discord.gg/gpumode)
- - For formal collaboration, send an email to yannchen@linkedin.com
+ - For open discussion, join [our discord channel on GPUMode](https://discord.com/channels/1189498204333543425/1275130785933951039)
+ - For formal collaboration, send an email to yannchen@linkedin.com and hning@linkedin.com

  ## Cite this work

@@ -405,7 +409,7 @@ Biblatex entry:
  ```

  ## Star History
- [![Star History Chart](https://api.star-history.com/svg?repos=linkedin/Liger-Kernel&type=Date)](https://star-history.com/#linkedin/Liger-Kernel&Date)
+ [![Star History Chart](https://api.star-history.com/svg?repos=linkedin/Liger-Kernel&type=Date)](https://www.star-history.com/#linkedin/Liger-Kernel&Date)

  <p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
  <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
@@ -65,6 +65,7 @@
  <details>
  <summary>Latest News 🔥</summary>

+ - [2025/03/06] We release a joint blog post on TorchTune × Liger - [Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel](https://pytorch.org/blog/peak-performance-minimized-memory/)
  - [2024/12/11] We release [v0.5.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.5.0): 80% more memory efficient post training losses (DPO, ORPO, CPO, etc)!
  - [2024/12/5] We release LinkedIn Engineering Blog - [Liger-Kernel: Empowering an open source ecosystem of Triton Kernels for Efficient LLM Training](https://www.linkedin.com/blog/engineering/open-source/liger-kernel-open-source-ecosystem-for-efficient-llm-training)
  - [2024/11/6] We release [v0.4.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.4.0): Full AMD support, Tech Report, Modal CI, Llama-3.2-Vision!
@@ -104,7 +105,7 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and
  We provide optimized post training kernels like DPO, ORPO, SimPO, and more which can reduce memory usage by up to 80%. You can easily use them as python modules.

  ```python
- from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss
+ from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss
  orpo_loss = LigerFusedLinearORPOLoss()
  y = orpo_loss(lm_head.weight, x, target)
  ```
@@ -127,7 +128,7 @@ y = orpo_loss(lm_head.weight, x, target)
  - **Exact:** Computation is exact—no approximations! Both forward and backward passes are implemented with rigorous unit tests and undergo convergence testing against training runs without Liger Kernel to ensure accuracy.
  - **Lightweight:** Liger Kernel has minimal dependencies, requiring only Torch and Triton—no extra libraries needed! Say goodbye to dependency headaches!
  - **Multi-GPU supported:** Compatible with multi-GPU setups (PyTorch FSDP, DeepSpeed, DDP, etc.).
- - **Trainer Framework Integration**: [Axolotl](https://github.com/axolotl-ai-cloud/axolotl), [LLaMa-Factory](https://github.com/hiyouga/LLaMA-Factory), [SFTTrainer](https://github.com/huggingface/trl/releases/tag/v0.10.1), [Hugging Face Trainer](https://github.com/huggingface/transformers/pull/32860), [SWIFT](https://github.com/modelscope/ms-swift)
+ - **Trainer Framework Integration**: [Axolotl](https://github.com/axolotl-ai-cloud/axolotl), [LLaMa-Factory](https://github.com/hiyouga/LLaMA-Factory), [SFTTrainer](https://github.com/huggingface/trl/releases/tag/v0.10.1), [Hugging Face Trainer](https://github.com/huggingface/transformers/pull/32860), [SWIFT](https://github.com/modelscope/ms-swift), [oumi](https://github.com/oumi-ai/oumi/tree/main)

  ## Installation

@@ -262,8 +263,10 @@ loss.backward()
262
263
  | Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
263
264
  | Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
264
265
  | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
266
+ | Paligemma, Paligemma2, & Paligemma2 Mix | `liger_kernel.transformers.apply_liger_kernel_to_paligemma` | LayerNorm, RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
265
267
  | Qwen2, Qwen2.5, & QwQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
266
268
  | Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
269
+ | Qwen2.5-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_5_vl` | RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
267
270
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
268
271
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
269
272
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -335,8 +338,8 @@ loss.backward()
335
338
  ## Contact
336
339
 
337
340
  - For issues, create a Github ticket in this repository
338
- - For open discussion, join [our discord channel](https://discord.gg/gpumode)
339
- - For formal collaboration, send an email to yannchen@linkedin.com
341
+ - For open discussion, join [our discord channel on GPUMode](https://discord.com/channels/1189498204333543425/1275130785933951039)
342
+ - For formal collaboration, send an email to yannchen@linkedin.com and hning@linkedin.com
340
343
 
341
344
  ## Cite this work
342
345
 
@@ -355,7 +358,7 @@ Biblatex entry:
355
358
  ```
356
359
 
357
360
  ## Star History
358
- [![Star History Chart](https://api.star-history.com/svg?repos=linkedin/Liger-Kernel&type=Date)](https://star-history.com/#linkedin/Liger-Kernel&Date)
361
+ [![Star History Chart](https://api.star-history.com/svg?repos=linkedin/Liger-Kernel&type=Date)](https://www.star-history.com/#linkedin/Liger-Kernel&Date)
359
362
 
360
363
  <p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
361
364
  <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
@@ -8,8 +8,8 @@ import matplotlib.pyplot as plt
8
8
  import pandas as pd
9
9
  import seaborn as sns
10
10
 
11
- DATA_PATH = "data/all_benchmark_data.csv"
12
- VISUALIZATIONS_PATH = "visualizations/"
11
+ DATA_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/all_benchmark_data.csv"))
12
+ VISUALIZATIONS_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "visualizations/"))
13
13
 
14
14
 
15
15
  @dataclass
@@ -751,36 +751,6 @@ fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,2,8645.314453125,8645.314
751
751
  fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,4,12184.330078125,12184.330078125,12184.330078125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1
752
752
  fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,8,19262.361328125,19262.361328125,19262.361328125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1
753
753
  fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,16,33418.42578125,33418.42578125,33418.42578125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1
754
- kto_loss,liger,forward,speed,ms,B,Batch Size (B),2,7.841599941253662,7.801983833312988,7.849664211273193,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
755
- kto_loss,liger,forward,speed,ms,B,Batch Size (B),4,15.568096160888672,15.555737495422363,16.054176330566406,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
756
- kto_loss,liger,forward,speed,ms,B,Batch Size (B),8,31.145376205444336,30.750951766967773,31.5398006439209,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
757
- kto_loss,liger,forward,speed,ms,B,Batch Size (B),16,61.49708938598633,61.49708938598633,61.49708938598633,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
758
- kto_loss,liger,forward,speed,ms,B,Batch Size (B),32,122.01449584960938,122.01449584960938,122.01449584960938,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
759
- kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),2,7.892335891723633,7.8687615394592285,8.03729248046875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
760
- kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),4,14.16302490234375,13.813311576843262,15.860223770141602,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
761
- kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),8,25.56470489501953,25.564167022705078,25.641658782958984,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
762
- kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),16,53.0928955078125,53.0928955078125,53.0928955078125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
763
- kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),32,108.76080322265625,108.76080322265625,108.76080322265625,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
764
- kto_loss,liger,full,speed,ms,B,Batch Size (B),2,8.662687301635742,8.488287925720215,9.611334800720215,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
765
- kto_loss,liger,full,speed,ms,B,Batch Size (B),4,18.40096092224121,17.99224281311035,18.57883644104004,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
766
- kto_loss,liger,full,speed,ms,B,Batch Size (B),8,32.09159851074219,31.708070755004883,32.475128173828125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
767
- kto_loss,liger,full,speed,ms,B,Batch Size (B),16,69.30239868164062,69.30239868164062,69.30239868164062,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
768
- kto_loss,liger,full,speed,ms,B,Batch Size (B),32,124.2437744140625,124.2437744140625,124.2437744140625,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
769
- kto_loss,huggingface,full,speed,ms,B,Batch Size (B),2,11.449472427368164,11.407564163208008,11.773555755615234,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
770
- kto_loss,huggingface,full,speed,ms,B,Batch Size (B),4,20.871471405029297,20.862951278686523,20.879276275634766,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
771
- kto_loss,huggingface,full,speed,ms,B,Batch Size (B),8,41.16409683227539,40.760780334472656,41.567413330078125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
772
- kto_loss,huggingface,full,speed,ms,B,Batch Size (B),16,77.720703125,77.720703125,77.720703125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
773
- kto_loss,huggingface,full,speed,ms,B,Batch Size (B),32,156.25794982910156,156.25794982910156,156.25794982910156,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
774
- kto_loss,liger,full,memory,MB,B,Batch Size (B),2,2027.48583984375,2027.48583984375,2027.48583984375,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
775
- kto_loss,liger,full,memory,MB,B,Batch Size (B),4,2789.736328125,2789.736328125,2789.736328125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
776
- kto_loss,liger,full,memory,MB,B,Batch Size (B),8,2801.751953125,2801.751953125,2801.751953125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
777
- kto_loss,liger,full,memory,MB,B,Batch Size (B),16,2825.783203125,2825.783203125,2825.783203125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
778
- kto_loss,liger,full,memory,MB,B,Batch Size (B),32,2873.845703125,2873.845703125,2873.845703125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
779
- kto_loss,huggingface,full,memory,MB,B,Batch Size (B),2,3786.7373046875,3786.7373046875,3786.7373046875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
780
- kto_loss,huggingface,full,memory,MB,B,Batch Size (B),4,5544.25390625,5544.25390625,5544.25390625,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
781
- kto_loss,huggingface,full,memory,MB,B,Batch Size (B),8,9057.287109375,9057.287109375,9057.287109375,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
782
- kto_loss,huggingface,full,memory,MB,B,Batch Size (B),16,16087.353515625,16087.353515625,16087.353515625,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
783
- kto_loss,huggingface,full,memory,MB,B,Batch Size (B),32,30147.486328125,30147.486328125,30147.486328125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
784
754
  distill_jsd_loss,liger,forward,speed,ms,BT,B x T,1024,7.735536098480225,7.729177474975586,7.798131465911865,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:58:46,0.4.2
785
755
  distill_jsd_loss,liger,forward,speed,ms,BT,B x T,2048,15.20411205291748,15.165056228637695,15.226079940795898,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:58:46,0.4.2
786
756
  distill_jsd_loss,liger,forward,speed,ms,BT,B x T,4096,30.159456253051758,30.126911163330078,30.165311813354492,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:58:46,0.4.2
@@ -805,4 +775,33 @@ distill_jsd_loss,torch,full,memory,MB,BT,B x T,1024,16174.0390625,16174.0390625,
805
775
  distill_jsd_loss,torch,full,memory,MB,BT,B x T,2048,23713.05078125,23713.05078125,23713.05078125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:01:32,0.4.2
806
776
  distill_jsd_loss,torch,full,memory,MB,BT,B x T,4096,38791.07421875,38791.07421875,38791.07421875,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:01:32,0.4.2
807
777
  distill_jsd_loss,torch,full,memory,MB,BT,B x T,8192,68947.1015625,68947.1015625,68947.1015625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:01:32,0.4.2
808
-
778
+ kto_loss,liger,forward,speed,ms,B,Batch Size (B),2,3.9951679706573486,3.991487979888916,4.002252578735352,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4
779
+ kto_loss,liger,forward,speed,ms,B,Batch Size (B),4,7.8037919998168945,7.788575649261475,7.808595180511475,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4
780
+ kto_loss,liger,forward,speed,ms,B,Batch Size (B),8,15.43172836303711,15.430015563964844,15.4335355758667,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4
781
+ kto_loss,liger,forward,speed,ms,B,Batch Size (B),16,30.66864013671875,30.66431999206543,30.670501708984375,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4
782
+ kto_loss,liger,forward,speed,ms,B,Batch Size (B),32,61.1163215637207,61.1163215637207,61.1163215637207,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4
783
+ kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),2,3.8766400814056396,3.8680384159088135,3.8897151947021484,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4
784
+ kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),4,7.213727951049805,7.206470489501953,7.229574680328369,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4
785
+ kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),8,13.828800201416016,13.810944557189941,13.834943771362305,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4
786
+ kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),16,27.0930233001709,27.08517074584961,27.09713363647461,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4
787
+ kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),32,54.13715362548828,54.13715362548828,54.13715362548828,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4
788
+ kto_loss,liger,full,speed,ms,B,Batch Size (B),2,4.782928466796875,4.677459239959717,5.3430914878845215,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4
789
+ kto_loss,liger,full,speed,ms,B,Batch Size (B),4,8.517248153686523,8.481344223022461,8.561504364013672,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4
790
+ kto_loss,liger,full,speed,ms,B,Batch Size (B),8,16.547504425048828,16.513471603393555,16.678144454956055,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4
791
+ kto_loss,liger,full,speed,ms,B,Batch Size (B),16,31.891263961791992,31.819705963134766,32.274131774902344,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4
792
+ kto_loss,liger,full,speed,ms,B,Batch Size (B),32,62.953758239746094,62.953758239746094,62.953758239746094,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4
793
+ kto_loss,huggingface,full,speed,ms,B,Batch Size (B),2,6.201632022857666,6.163315296173096,6.314668655395508,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4
794
+ kto_loss,huggingface,full,speed,ms,B,Batch Size (B),4,11.156224250793457,11.142304420471191,11.207296371459961,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4
795
+ kto_loss,huggingface,full,speed,ms,B,Batch Size (B),8,21.249855041503906,21.231891632080078,21.264543533325195,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4
796
+ kto_loss,huggingface,full,speed,ms,B,Batch Size (B),16,41.55686569213867,41.536956787109375,41.57677459716797,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4
797
+ kto_loss,huggingface,full,speed,ms,B,Batch Size (B),32,81.56924438476562,81.56924438476562,81.56924438476562,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4
798
+ kto_loss,liger,full,memory,MB,B,Batch Size (B),2,2585.73876953125,2585.73876953125,2585.73876953125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4
799
+ kto_loss,liger,full,memory,MB,B,Batch Size (B),4,3348.9892578125,3348.9892578125,3348.9892578125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4
800
+ kto_loss,liger,full,memory,MB,B,Batch Size (B),8,3361.0048828125,3361.0048828125,3361.0048828125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4
801
+ kto_loss,liger,full,memory,MB,B,Batch Size (B),16,3385.0361328125,3385.0361328125,3385.0361328125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4
802
+ kto_loss,liger,full,memory,MB,B,Batch Size (B),32,3433.0986328125,3433.0986328125,3433.0986328125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4
803
+ kto_loss,huggingface,full,memory,MB,B,Batch Size (B),2,4341.74951171875,4341.74951171875,4341.74951171875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4
804
+ kto_loss,huggingface,full,memory,MB,B,Batch Size (B),4,6099.26513671875,6099.26513671875,6099.26513671875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4
805
+ kto_loss,huggingface,full,memory,MB,B,Batch Size (B),8,9613.298828125,9613.298828125,9613.298828125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4
806
+ kto_loss,huggingface,full,memory,MB,B,Batch Size (B),16,16643.365234375,16643.365234375,16643.365234375,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4
807
+ kto_loss,huggingface,full,memory,MB,B,Batch Size (B),32,30703.498046875,30703.498046875,30703.498046875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4
@@ -81,6 +81,8 @@ class LigerJSDLoss(torch.nn.Module):
81
81
  teacher,
82
82
  self.teacher_lin.weight,
83
83
  target,
84
+ self.student_lin.bias,
85
+ self.teacher_lin.bias,
84
86
  self.weight_hard_loss,
85
87
  self.weight_soft_loss,
86
88
  )
@@ -0,0 +1,139 @@
1
+ import os
2
+ import sys
3
+
4
+ import torch
5
+ import triton
6
+
7
+ from utils import QUANTILES
8
+ from utils import SingleBenchmarkRunInput
9
+ from utils import SingleBenchmarkRunOutput
10
+ from utils import _test_memory
11
+ from utils import parse_benchmark_script_args
12
+ from utils import run_benchmarks
13
+
14
+ from liger_kernel.utils import infer_device
15
+
16
+ device = infer_device()
17
+
18
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
19
+
20
+
21
+ def bench_speed_dyt(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
22
+ from test.transformers.test_dyt import LigerDyT
23
+ from test.transformers.test_dyt import TorchDyT
24
+
25
+ BT = input.x
26
+ provider = input.kernel_provider
27
+ mode = input.kernel_operation_mode
28
+ extra_benchmark_config = input.extra_benchmark_config
29
+ hidden_size = extra_benchmark_config["hidden_size"]
30
+ dtype = extra_benchmark_config["dtype"]
31
+
32
+ x_shape = (BT, hidden_size)
33
+ torch_dyt = TorchDyT(hidden_size=hidden_size).to(device)
34
+ torch_compile_dyt = torch.compile(TorchDyT(hidden_size=hidden_size).to(device))
35
+ triton_dyt = LigerDyT(hidden_size=hidden_size).to(device)
36
+
37
+ x = torch.randn(x_shape, dtype=dtype, device=device)
38
+ dy = torch.randn_like(x)
39
+ x.requires_grad_(True)
40
+
41
+ def fwd():
42
+ if provider == "liger":
43
+ return triton_dyt(x)
44
+ elif provider == "torch":
45
+ return torch_dyt(x)
46
+ elif provider == "torch_compile":
47
+ return torch_compile_dyt(x)
48
+
49
+ if mode == "forward":
50
+ ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, grad_to_none=[x], rep=500)
51
+ elif mode == "backward":
52
+ y = fwd()
53
+ ms_50, ms_20, ms_80 = triton.testing.do_bench(
54
+ lambda: y.backward(dy, retain_graph=True),
55
+ quantiles=QUANTILES,
56
+ grad_to_none=[x],
57
+ rep=500,
58
+ )
59
+ elif mode == "full":
60
+
61
+ def full():
62
+ y = fwd()
63
+ y.backward(dy)
64
+
65
+ ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, grad_to_none=[x], rep=500)
66
+
67
+ return SingleBenchmarkRunOutput(
68
+ y_20=ms_20,
69
+ y_50=ms_50,
70
+ y_80=ms_80,
71
+ )
72
+
73
+
74
+ def bench_memory_dyt(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
75
+ from test.transformers.test_dyt import LigerDyT
76
+ from test.transformers.test_dyt import TorchDyT
77
+
78
+ BT = input.x
79
+ provider = input.kernel_provider
80
+ extra_benchmark_config = input.extra_benchmark_config
81
+ hidden_size = extra_benchmark_config["hidden_size"]
82
+ dtype = extra_benchmark_config["dtype"]
83
+
84
+ x_shape = (BT, hidden_size)
85
+ torch_dyt = TorchDyT(hidden_size=hidden_size).to(device)
86
+ torch_compile_dyt = torch.compile(TorchDyT(hidden_size=hidden_size).to(device))
87
+ triton_dyt = LigerDyT(hidden_size=hidden_size).to(device)
88
+
89
+ x = torch.randn(x_shape, dtype=dtype, device=device)
90
+ dy = torch.randn_like(x)
91
+ x.requires_grad_(True)
92
+
93
+ def fwd():
94
+ if provider == "liger":
95
+ return triton_dyt(x)
96
+ elif provider == "torch":
97
+ return torch_dyt(x)
98
+ elif provider == "torch_compile":
99
+ return torch_compile_dyt(x)
100
+
101
+ def full():
102
+ y = fwd()
103
+ y.backward(dy, retain_graph=True)
104
+
105
+ mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES)
106
+ return SingleBenchmarkRunOutput(
107
+ y_20=mem_20,
108
+ y_50=mem_50,
109
+ y_80=mem_80,
110
+ )
111
+
112
+
113
+ if __name__ == "__main__":
114
+ args = parse_benchmark_script_args()
115
+
116
+ common_configs = {
117
+ "kernel_name": "dyt",
118
+ "x_name": "BT",
119
+ "x_label": "batch_size * seq_len",
120
+ "x_values": [2**i for i in range(10, 15)],
121
+ "kernel_providers": ["liger", "torch", "torch_compile"],
122
+ "extra_benchmark_configs": [{"hidden_size": 4096, "dtype": torch.float32}],
123
+ "overwrite": args.overwrite,
124
+ }
125
+
126
+ run_benchmarks(
127
+ bench_test_fn=bench_speed_dyt,
128
+ kernel_operation_modes=["forward", "backward", "full"],
129
+ metric_name="speed",
130
+ metric_unit="ms",
131
+ **common_configs,
132
+ )
133
+ run_benchmarks(
134
+ bench_test_fn=bench_memory_dyt,
135
+ kernel_operation_modes=["full"],
136
+ metric_name="memory",
137
+ metric_unit="MB",
138
+ **common_configs,
139
+ )
@@ -149,7 +149,7 @@ def bench_memory_kto_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
149
149
  y=target,
150
150
  preference_labels=preference_labels,
151
151
  kl=kl,
152
- )
152
+ )[0]
153
153
  elif provider == "huggingface":
154
154
  return torch_kto_loss(
155
155
  x=_input,
@@ -157,7 +157,7 @@ def bench_memory_kto_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
157
157
  y=target,
158
158
  preference_labels=preference_labels,
159
159
  kl=kl,
160
- )
160
+ )[0]
161
161
 
162
162
  def full():
163
163
  y = fwd()
@@ -230,7 +230,7 @@ def bench_speed_kto_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
230
230
  y=target,
231
231
  preference_labels=preference_labels,
232
232
  kl=kl,
233
- )
233
+ )[0]
234
234
  elif provider == "huggingface":
235
235
  return torch_kto_loss(
236
236
  x=_input,
@@ -238,7 +238,7 @@ def bench_speed_kto_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
238
238
  y=target,
239
239
  preference_labels=preference_labels,
240
240
  kl=kl,
241
- )
241
+ )[0]
242
242
 
243
243
  if mode == "forward":
244
244
  ms_50, ms_20, ms_80 = triton.testing.do_bench(
@@ -14,7 +14,7 @@ app = modal.App("liger_tests", image=image)
14
14
  repo = modal.Mount.from_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
15
15
 
16
16
 
17
- @app.function(gpu="A10G", mounts=[repo], timeout=60 * 20)
17
+ @app.function(gpu="A10G", mounts=[repo], timeout=60 * 30)
18
18
  def liger_tests():
19
19
  import subprocess
20
20
 
@@ -14,7 +14,7 @@ app = modal.App("liger_tests_bwd", image=image)
14
14
  repo = modal.Mount.from_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
15
15
 
16
16
 
17
- @app.function(gpu="A10G", mounts=[repo], timeout=60 * 15)
17
+ @app.function(gpu="A10G", mounts=[repo], timeout=60 * 30)
18
18
  def liger_bwd_tests():
19
19
  import subprocess
20
20
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "liger_kernel"
7
- version = "0.5.4"
7
+ version = "0.5.6"
8
8
  description = "Efficient Triton kernels for LLM Training"
9
9
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -39,8 +39,9 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
39
39
 
40
40
  return loss, chosen_rewards, rejected_rewards
41
41
 
42
- @staticmethod
42
+ @classmethod
43
43
  def forward(
44
+ cls,
44
45
  ctx,
45
46
  _input,
46
47
  weight,
@@ -52,27 +53,48 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
52
53
  label_smoothing=0.0,
53
54
  compute_nll_loss=True,
54
55
  compiled=True,
56
+ average_log_prob=False,
57
+ chunk_size=1,
55
58
  ):
56
- return LigerFusedLinearPreferenceBase.forward(
57
- ctx,
58
- _input,
59
- weight,
60
- target,
61
- bias,
62
- loss_fn=LigerFusedLinearCPOFunction.preference_loss_fn,
59
+ """
60
+ Fused linear layer with CPO loss.
61
+ Args:
62
+ _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size)
63
+ weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size)
64
+ target (torch.LongTensor): Target tensor. Shape: (batch_size * seq_len,)
65
+ bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,)
66
+ ignore_index (int): Index to ignore in loss computation
67
+ beta (float): Weight for the odds ratio loss
68
+ alpha (float): Weight for the alpha parameter
69
+ label_smoothing (float): Label smoothing factor
70
+ compute_nll_loss (bool): Whether to compute the NLL loss
71
+ compiled (bool): Whether to use torch compile
72
+ average_log_prob (bool): Whether to average the log probability per non-masked token
73
+ chunk_size (int): Size of chunks for processing.
74
+ Returns:
75
+ torch.Tensor: Computed loss
76
+ """
77
+ return super().forward(
78
+ cls=cls,
79
+ ctx=ctx,
80
+ _input=_input,
81
+ weight=weight,
82
+ target=target,
83
+ bias=bias,
63
84
  ignore_index=ignore_index,
64
85
  alpha=alpha,
65
86
  beta=beta,
66
87
  label_smoothing=label_smoothing,
67
88
  compute_nll_loss=compute_nll_loss,
68
- average_log_prob=False,
89
+ average_log_prob=average_log_prob,
69
90
  compiled=compiled,
91
+ chunk_size=chunk_size,
70
92
  )
71
93
 
72
94
  @staticmethod
73
95
  def backward(ctx, *grad_output):
74
96
  grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
75
- return *grads, None, None, None, None, None, None
97
+ return *grads, None, None, None, None, None, None, None, None
76
98
 
77
99
 
78
100
  class LigerFusedLinearCPOLoss(torch.nn.Module):
@@ -88,11 +110,19 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
88
110
  label_smoothing: float = 0.0,
89
111
  compute_nll_loss: bool = True,
90
112
  compiled: bool = True,
113
+ average_log_prob: bool = False,
114
+ chunk_size: int = 1,
91
115
  ):
92
116
  """
93
117
  Args:
94
118
  ignore_index (int): Index to ignore in the loss.
95
119
  beta (float): Weight for the odds ratio loss.
120
+ alpha (float): Weight for the alpha parameter.
121
+ label_smoothing (float): Label smoothing factor.
122
+ compute_nll_loss (bool): Whether to compute the NLL loss.
123
+ compiled (bool): Whether to use the torch compiled kernel.
124
+ average_log_prob (bool): Whether to average the log probability per non-masked token.
125
+ chunk_size (int): Size of chunks for processing.
96
126
  """
97
127
  super().__init__()
98
128
  self.ignore_index = ignore_index
@@ -101,8 +131,16 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
101
131
  self.label_smoothing = label_smoothing
102
132
  self.compute_nll_loss = compute_nll_loss
103
133
  self.compiled = compiled
134
+ self.average_log_prob = average_log_prob
135
+ self.chunk_size = chunk_size
104
136
 
105
- def forward(self, lin_weight, _input, target, bias=None):
137
+ def forward(
138
+ self,
139
+ lin_weight,
140
+ _input,
141
+ target,
142
+ bias=None,
143
+ ):
106
144
  return LigerFusedLinearCPOFunction.apply(
107
145
  _input,
108
146
  lin_weight,
@@ -114,4 +152,6 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
114
152
  self.label_smoothing,
115
153
  self.compute_nll_loss,
116
154
  self.compiled,
155
+ self.average_log_prob,
156
+ self.chunk_size,
117
157
  )