liger-kernel-nightly 0.6.0.dev20250718080702__tar.gz → 0.6.0.dev20250719041256__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/PKG-INFO +1 -1
  2. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/data/all_benchmark_data.csv +41 -31
  3. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/pyproject.toml +1 -1
  4. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/layer_norm.py +126 -88
  5. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
  6. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_layer_norm.py +3 -0
  7. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_monkey_patch.py +1 -0
  8. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  9. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  10. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/pull_request_template.md +0 -0
  11. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/amd-ci.yml +0 -0
  12. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/benchmark.yml +0 -0
  13. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/docs.yml +0 -0
  14. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/intel-ci.yml +0 -0
  15. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/nvi-ci.yml +0 -0
  16. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/publish-nightly.yml +0 -0
  17. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/publish-release.yml +0 -0
  18. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/.gitignore +0 -0
  19. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/LICENSE +0 -0
  20. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/Makefile +0 -0
  21. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/NOTICE +0 -0
  22. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/README.md +0 -0
  23. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/README.md +0 -0
  24. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/__init__.py +0 -0
  25. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/benchmarks_visualizer.py +0 -0
  26. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/__init__.py +0 -0
  27. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  28. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  29. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  30. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  31. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  32. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_dyt.py +0 -0
  33. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_embedding.py +0 -0
  34. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
  35. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  36. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  37. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  38. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_geglu.py +0 -0
  39. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_group_norm.py +0 -0
  40. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_jsd.py +0 -0
  41. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_kl_div.py +0 -0
  42. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  43. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  44. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  45. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  46. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  47. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  48. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_rope.py +0 -0
  49. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  50. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_softmax.py +0 -0
  51. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  52. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  53. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_swiglu.py +0 -0
  54. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_tvd.py +0 -0
  55. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/utils.py +0 -0
  56. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/dev/fmt-requirements.txt +0 -0
  57. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/dev/modal/benchmarks.py +0 -0
  58. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/dev/modal/tests.py +0 -0
  59. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/dev/modal/tests_bwd.py +0 -0
  60. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/Examples.md +0 -0
  61. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/Getting-Started.md +0 -0
  62. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/High-Level-APIs.md +0 -0
  63. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/Low-Level-APIs.md +0 -0
  64. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/acknowledgement.md +0 -0
  65. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/contributing.md +0 -0
  66. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/banner.GIF +0 -0
  67. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/compose.gif +0 -0
  68. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/e2e-memory.png +0 -0
  69. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/e2e-tps.png +0 -0
  70. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/logo-banner.png +0 -0
  71. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/patch.gif +0 -0
  72. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/post-training.png +0 -0
  73. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/index.md +0 -0
  74. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/license.md +0 -0
  75. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/alignment/accelerate_config.yaml +0 -0
  76. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/alignment/run_orpo.py +0 -0
  77. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/README.md +0 -0
  78. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/callback.py +0 -0
  79. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/config/fsdp_config.json +0 -0
  80. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  81. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  82. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  83. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/llama_tps.png +0 -0
  84. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  85. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/qwen_tps.png +0 -0
  86. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/launch_on_modal.py +0 -0
  87. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/requirements.txt +0 -0
  88. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_benchmarks.sh +0 -0
  89. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_gemma.sh +0 -0
  90. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_llama.sh +0 -0
  91. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_qwen.sh +0 -0
  92. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_qwen2_vl.sh +0 -0
  93. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/training.py +0 -0
  94. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/training_multimodal.py +0 -0
  95. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/lightning/README.md +0 -0
  96. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/lightning/requirements.txt +0 -0
  97. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/lightning/training.py +0 -0
  98. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/README.md +0 -0
  99. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/callback.py +0 -0
  100. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  101. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  102. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  103. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  104. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  105. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  106. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  107. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  108. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  109. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/medusa_util.py +0 -0
  110. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/requirements.txt +0 -0
  111. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  112. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/train.py +0 -0
  113. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-Apache-2.0 +0 -0
  114. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  115. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  116. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-MIT-llmc +0 -0
  117. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-MIT-triton +0 -0
  118. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/mkdocs.yml +0 -0
  119. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/setup.cfg +0 -0
  120. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/setup.py +0 -0
  121. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/__init__.py +0 -0
  122. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/README.md +0 -0
  123. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  124. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
  125. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  126. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  127. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/functional.py +0 -0
  128. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  129. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  130. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  131. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  132. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  133. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  134. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  135. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  136. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  137. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/env_report.py +0 -0
  138. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/__init__.py +0 -0
  139. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/cross_entropy.py +0 -0
  140. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/dyt.py +0 -0
  141. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  142. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  143. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
  144. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  145. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  146. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  147. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/geglu.py +0 -0
  148. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/group_norm.py +0 -0
  149. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/grpo_loss.py +0 -0
  150. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/jsd.py +0 -0
  151. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/kl_div.py +0 -0
  152. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  153. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  154. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/rms_norm.py +0 -0
  155. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/rope.py +0 -0
  156. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/softmax.py +0 -0
  157. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/sparsemax.py +0 -0
  158. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/swiglu.py +0 -0
  159. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/tvd.py +0 -0
  160. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/utils.py +0 -0
  161. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/__init__.py +0 -0
  162. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/auto_model.py +0 -0
  163. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  164. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/dyt.py +0 -0
  165. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  166. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fsdp.py +0 -0
  167. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/functional.py +0 -0
  168. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
  169. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  170. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  171. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  172. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/geglu.py +0 -0
  173. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/group_norm.py +0 -0
  174. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  175. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/jsd.py +0 -0
  176. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/kl_div.py +0 -0
  177. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/layer_norm.py +0 -0
  178. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/__init__.py +0 -0
  179. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/gemma.py +0 -0
  180. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  181. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/gemma3.py +0 -0
  182. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/glm4.py +0 -0
  183. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/llama.py +0 -0
  184. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/llama4.py +0 -0
  185. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/llava.py +0 -0
  186. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  187. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/mistral.py +0 -0
  188. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  189. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/mllama.py +0 -0
  190. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  191. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/paligemma.py +0 -0
  192. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/phi3.py +0 -0
  193. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  194. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  195. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  196. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  197. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  198. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/smollm3.py +0 -0
  199. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/monkey_patch.py +0 -0
  200. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  201. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  202. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/rms_norm.py +0 -0
  203. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/rope.py +0 -0
  204. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/softmax.py +0 -0
  205. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/sparsemax.py +0 -0
  206. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/swiglu.py +0 -0
  207. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  208. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  209. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  210. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/tvd.py +0 -0
  211. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/triton/__init__.py +0 -0
  212. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/triton/monkey_patch.py +0 -0
  213. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/utils.py +0 -0
  214. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
  215. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  216. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  217. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  218. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/__init__.py +0 -0
  219. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/__init__.py +0 -0
  220. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_cosine_loss.py +0 -0
  221. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_cpo_loss.py +0 -0
  222. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_dpo_loss.py +0 -0
  223. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_grpo_loss.py +0 -0
  224. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_jsd_loss.py +0 -0
  225. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_kto_loss.py +0 -0
  226. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_orpo_loss.py +0 -0
  227. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_simpo_loss.py +0 -0
  228. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/conftest.py +0 -0
  229. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/__init__.py +0 -0
  230. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/bf16/__init__.py +0 -0
  231. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/bf16/test_mini_models.py +0 -0
  232. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
  233. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
  234. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/fp32/__init__.py +0 -0
  235. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/fp32/test_mini_models.py +0 -0
  236. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
  237. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
  238. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  239. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  240. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  241. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  242. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  243. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  244. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  245. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  246. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  247. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  248. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/tiny_shakespeare.txt +0 -0
  249. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  250. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  251. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  252. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_auto_model.py +0 -0
  253. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_cross_entropy.py +0 -0
  254. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_dyt.py +0 -0
  255. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_embedding.py +0 -0
  256. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_flex_attention.py +0 -0
  257. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_fused_add_rms_norm.py +0 -0
  258. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  259. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_fused_linear_jsd.py +0 -0
  260. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  261. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_geglu.py +0 -0
  262. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_group_norm.py +0 -0
  263. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_grpo_loss.py +0 -0
  264. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_jsd.py +0 -0
  265. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_kl_div.py +0 -0
  266. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_mm_int8int2.py +0 -0
  267. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_multi_token_attention.py +0 -0
  268. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_qwen2vl_mrope.py +0 -0
  269. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_rms_norm.py +0 -0
  270. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_rope.py +0 -0
  271. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_softmax.py +0 -0
  272. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_sparsemax.py +0 -0
  273. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_swiglu.py +0 -0
  274. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_trainer_integration.py +0 -0
  275. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_transformers.py +0 -0
  276. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_tvd.py +0 -0
  277. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/triton/test_triton_monkey_patch.py +0 -0
  278. {liger_kernel_nightly-0.6.0.dev20250718080702 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.0.dev20250718080702
3
+ Version: 0.6.0.dev20250719041256
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -625,36 +625,6 @@ group_norm,huggingface,backward,memory,MB,C,num_channels,256,320.5078125,320.507
625
625
  group_norm,huggingface,backward,memory,MB,C,num_channels,512,641.015625,641.015625,641.015625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
626
626
  group_norm,huggingface,backward,memory,MB,C,num_channels,1024,1282.03125,1282.03125,1282.03125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
627
627
  group_norm,huggingface,backward,memory,MB,C,num_channels,2048,2564.0625,2564.0625,2564.0625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
628
- layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.035840000957250595,0.03481600061058998,0.035840000957250595,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
629
- layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.05939200147986412,0.058368001133203506,0.060416001826524734,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
630
- layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.10751999914646149,0.10751999914646149,0.1085439994931221,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
631
- layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.20582400262355804,0.20479999482631683,0.20684799551963806,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
632
- layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.3993600010871887,0.3983359932899475,0.40140798687934875,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
633
- layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.03788800165057182,0.03788800165057182,0.03891199827194214,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
634
- layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.0655359998345375,0.0655359998345375,0.06656000018119812,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
635
- layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.14745600521564484,0.14643199741840363,0.14847999811172485,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
636
- layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.31334400177001953,0.3123199939727783,0.31436800956726074,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
637
- layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.6133760213851929,0.6123520135879517,0.6154239773750305,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
638
- layer_norm,liger,full,speed,ms,N,hidden size,1024,0.6860799789428711,0.6146048903465271,0.7049216032028198,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
639
- layer_norm,liger,full,speed,ms,N,hidden size,2048,0.6789119839668274,0.6737920045852661,0.6912000179290771,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
640
- layer_norm,liger,full,speed,ms,N,hidden size,4096,0.6686720252037048,0.6635519862174988,0.681984007358551,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
641
- layer_norm,liger,full,speed,ms,N,hidden size,8192,0.6789119839668274,0.5908480286598206,0.6932479739189148,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
642
- layer_norm,liger,full,speed,ms,N,hidden size,16384,6.071296215057373,5.331148624420166,6.08235502243042,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
643
- layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.13312000036239624,0.13209599256515503,0.13312000036239624,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
644
- layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.23244799673557281,0.2303999960422516,0.23347200453281403,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
645
- layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.5242879986763,0.5232639908790588,0.5263360142707825,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
646
- layer_norm,huggingface,full,speed,ms,N,hidden size,8192,1.0168319940567017,1.0147839784622192,1.018880009651184,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
647
- layer_norm,huggingface,full,speed,ms,N,hidden size,16384,1.994752049446106,1.9916800260543823,1.9967999458312988,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
648
- layer_norm,liger,full,memory,MB,N,hidden size,1024,80.90625,80.90625,80.90625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
649
- layer_norm,liger,full,memory,MB,N,hidden size,2048,161.78125,161.78125,161.78125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
650
- layer_norm,liger,full,memory,MB,N,hidden size,4096,323.53125,323.53125,323.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
651
- layer_norm,liger,full,memory,MB,N,hidden size,8192,647.03125,647.03125,647.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
652
- layer_norm,liger,full,memory,MB,N,hidden size,16384,1294.03125,1294.03125,1294.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
653
- layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
654
- layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
655
- layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
656
- layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
657
- layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
658
628
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,2,116.00621032714844,116.00621032714844,116.00621032714844,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
659
629
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,4,230.83609008789062,230.83609008789062,230.83609008789062,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
660
630
  fused_linear_orpo_loss,liger,forward,speed,ms,B,B,8,461.9543151855469,461.9543151855469,461.9543151855469,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
@@ -1493,6 +1463,46 @@ distill_cosine_loss,torch,full,memory,MB,BT,B x T,1024,7566.2822265625,7566.2822
1493
1463
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,2048,11590.3134765625,11590.3134765625,11590.3134765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1494
1464
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,4096,19654.375,19654.375,19654.375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1495
1465
  distill_cosine_loss,torch,full,memory,MB,BT,B x T,8192,35782.5,35782.5,35782.5,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
1466
+ layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.018848000094294548,0.018400000408291817,0.020102400332689285,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1467
+ layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.029152000322937965,0.02876799926161766,0.029823999851942062,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1468
+ layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.05104000121355057,0.05036799982190132,0.05177599936723709,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1469
+ layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.0947519987821579,0.09436800330877304,0.09507200121879578,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1470
+ layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.18476800620555878,0.18396799266338348,0.1852159947156906,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
1471
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.023584000766277313,0.023423999547958374,0.023840000852942467,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1472
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.03734400123357773,0.03702399879693985,0.037811201065778746,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1473
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.06617599725723267,0.06560000032186508,0.06678400188684464,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1474
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.15267199277877808,0.15190400183200836,0.15347200632095337,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1475
+ layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.3067840039730072,0.3046143889427185,0.3081152021884918,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
1476
+ layer_norm,liger,backward,speed,ms,N,hidden size,1024,0.12006399780511856,0.11653760075569153,0.12467200309038162,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1477
+ layer_norm,liger,backward,speed,ms,N,hidden size,2048,0.1207360029220581,0.1176128014922142,0.1256511986255646,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1478
+ layer_norm,liger,backward,speed,ms,N,hidden size,4096,0.16630400717258453,0.16412800550460815,0.16838400065898895,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1479
+ layer_norm,liger,backward,speed,ms,N,hidden size,8192,0.31279999017715454,0.31116798520088196,0.3145279884338379,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1480
+ layer_norm,liger,backward,speed,ms,N,hidden size,16384,0.5776320099830627,0.5753471970558167,0.5798912048339844,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
1481
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,1024,0.0605119988322258,0.059647999703884125,0.061344001442193985,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1482
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,2048,0.09967999905347824,0.09849599748849869,0.10099200159311295,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1483
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,4096,0.17881600558757782,0.17795200645923615,0.17971199750900269,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1484
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,8192,0.33369600772857666,0.3328000009059906,0.33478400111198425,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1485
+ layer_norm,huggingface,backward,speed,ms,N,hidden size,16384,0.6424000263214111,0.6412223815917969,0.643455982208252,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
1486
+ layer_norm,liger,full,speed,ms,N,hidden size,1024,0.26576000452041626,0.2629248082637787,0.2701759934425354,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1487
+ layer_norm,liger,full,speed,ms,N,hidden size,2048,0.27427199482917786,0.26999040842056277,0.28091518878936766,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1488
+ layer_norm,liger,full,speed,ms,N,hidden size,4096,0.27454400062561035,0.27004799246788025,0.2807359993457794,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1489
+ layer_norm,liger,full,speed,ms,N,hidden size,8192,0.40556800365448,0.40403199195861816,0.40723198652267456,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1490
+ layer_norm,liger,full,speed,ms,N,hidden size,16384,0.7608960270881653,0.7589311957359314,0.7631679773330688,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
1491
+ layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.08025600016117096,0.07942400127649307,0.08111999928951263,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1492
+ layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.13315199315547943,0.13180799782276154,0.13468800485134125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1493
+ layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.2417600005865097,0.24089600145816803,0.24262399971485138,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1494
+ layer_norm,huggingface,full,speed,ms,N,hidden size,8192,0.4832639992237091,0.48214399814605713,0.4843647956848145,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1495
+ layer_norm,huggingface,full,speed,ms,N,hidden size,16384,0.950575977563858,0.9484800100326538,0.9528064012527466,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1496
+ layer_norm,liger,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1497
+ layer_norm,liger,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1498
+ layer_norm,liger,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1499
+ layer_norm,liger,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1500
+ layer_norm,liger,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1501
+ layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1502
+ layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1503
+ layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1504
+ layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1505
+ layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
1496
1506
  fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,1024,0.01759999990463257,0.017311999574303627,0.017920000478625298,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1497
1507
  fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,2048,0.02924799919128418,0.028863999992609024,0.029983999207615852,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
1498
1508
  fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,4096,0.05129599943757057,0.050624001771211624,0.05209600180387497,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
@@ -1564,4 +1574,4 @@ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,2048,208.06298828
1564
1574
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578125,416.11767578125,416.11767578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1565
1575
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1566
1576
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1567
- fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
1577
+ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "liger_kernel_nightly"
7
- version = "0.6.0.dev20250718080702"
7
+ version = "0.6.0.dev20250719041256"
8
8
  description = "Efficient Triton kernels for LLM Training"
9
9
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -1,4 +1,3 @@
1
- import math
2
1
  import operator
3
2
 
4
3
  import torch
@@ -43,30 +42,45 @@ def _layer_norm_forward_kernel(
43
42
  https://arxiv.org/abs/1607.06450
44
43
  https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
45
44
  """
46
- row_idx = tl.program_id(0)
45
+ row_idx = tl.program_id(0).to(tl.int64)
47
46
  col_offsets = tl.arange(0, BLOCK_SIZE)
48
47
  mask = col_offsets < n_cols
49
48
 
50
- Y_ptr += row_idx * Y_row_stride
51
- X_ptr += row_idx * X_row_stride
52
- Mean_ptr += row_idx * Mean_row_stride
53
- RSTD_ptr += row_idx * RSTD_row_stride
54
-
55
- X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
56
- W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)
57
- B_row = tl.load(B_ptr + col_offsets, mask=mask, other=0)
58
-
59
- mean = tl.sum(X_row, axis=0) / n_cols
60
- Xmm = tl.where(mask, X_row - mean, 0)
61
- var = tl.sum(Xmm * Xmm, axis=0) / n_cols
49
+ # Pre-load weights and bias in fp32 to avoid repeated conversions
50
+ W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
51
+ B_row = tl.load(B_ptr + col_offsets, mask=mask, other=0.0)
52
+ W_f32 = W_row.to(tl.float32)
53
+ B_f32 = B_row.to(tl.float32)
54
+
55
+ # Calculate pointers for this row
56
+ row_X_ptr = X_ptr + row_idx * X_row_stride
57
+ row_Y_ptr = Y_ptr + row_idx * Y_row_stride
58
+ row_Mean_ptr = Mean_ptr + row_idx * Mean_row_stride
59
+ row_RSTD_ptr = RSTD_ptr + row_idx * RSTD_row_stride
60
+
61
+ # Load input data and convert to fp32 for numerical stability
62
+ X_row = tl.load(row_X_ptr + col_offsets, mask=mask, other=0.0)
63
+ X_f32 = X_row.to(tl.float32)
64
+
65
+ # Compute statistics in fp32 for numerical stability
66
+ n_cols_f32 = n_cols.to(tl.float32)
67
+ mean = tl.sum(X_f32, axis=0) / n_cols_f32
68
+ X_centered = X_f32 - mean
69
+ # Apply mask to variance calculation to exclude contributions from masked elements
70
+ X_centered_masked = tl.where(mask, X_centered, 0.0)
71
+ var = tl.sum(X_centered_masked * X_centered_masked, axis=0) / n_cols_f32
62
72
  rstd = rsqrt(var + eps)
63
73
 
64
- tl.store(Mean_ptr, mean)
65
- tl.store(RSTD_ptr, rstd)
74
+ # Store statistics (convert back to original dtype only once)
75
+ tl.store(row_Mean_ptr, mean.to(X_row.dtype))
76
+ tl.store(row_RSTD_ptr, rstd.to(X_row.dtype))
66
77
 
67
- Y_row = Xmm * rstd * W_row + B_row
78
+ # Fused normalization and affine transformation
79
+ # Y = (X - mean) * rstd * W + B = X_centered * rstd * W + B
80
+ Y_f32 = X_centered * rstd * W_f32 + B_f32
68
81
 
69
- tl.store(Y_ptr + col_offsets, Y_row, mask=mask)
82
+ # Store output (single conversion back to original dtype)
83
+ tl.store(row_Y_ptr + col_offsets, Y_f32.to(X_row.dtype), mask=mask)
70
84
 
71
85
 
72
86
  @triton.jit
@@ -81,73 +95,87 @@ def _layer_norm_backward_kernel(
81
95
  DY_ptr, # pointer to output grad, shape (n_rows, n_cols)
82
96
  stride_x, # stride of each row in input
83
97
  stride_dx, # stride of each row in input grad
84
- stride_dw, # stride of each row in weights grad
85
- stride_db, # stride of each row in bias grad
86
98
  stride_dy, # stride of each row in output grad
87
- n_rows,
88
99
  n_cols,
89
- rows_per_program: tl.constexpr,
90
100
  BLOCK_SIZE: tl.constexpr,
91
101
  dtype: tl.constexpr,
102
+ atomic_dtype: tl.constexpr,
92
103
  ):
93
104
  """
94
105
  References:
95
106
  https://arxiv.org/abs/1607.06450
96
107
  https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
97
- https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
98
- https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/triton/layer_norm.py
99
108
  """
100
- row_block_id = tl.program_id(0)
101
- row_start = row_block_id * rows_per_program
102
- row_end = min((row_block_id + 1) * rows_per_program, n_rows)
109
+ row_idx = tl.program_id(0).to(tl.int64)
103
110
  cols = tl.arange(0, BLOCK_SIZE)
104
111
  mask = cols < n_cols
105
112
 
106
- dw_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
107
- db_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
108
-
109
- X_ptr += row_start * stride_x
110
- Mean_ptr += row_start
111
- RSTD_ptr += row_start
112
- DX_ptr += row_start * stride_dx
113
- DY_ptr += row_start * stride_dy
114
-
115
- for _ in range(row_start, row_end):
116
- x = tl.load(X_ptr + cols, mask=mask, other=0.0)
117
- w = tl.load(W_ptr + cols, mask=mask, other=0.0)
118
- dy = tl.load(DY_ptr + cols, mask=mask, other=0.0)
119
- mean = tl.load(Mean_ptr)
120
- rstd = tl.load(RSTD_ptr)
121
-
122
- x_hat = (x - mean) * rstd
123
- wdy = w * dy
124
- c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
125
- c2 = tl.sum(wdy, axis=0) / n_cols
126
- dx = (wdy - (x_hat * c1 + c2)) * rstd
127
- tl.store(DX_ptr + cols, dx.to(dtype), mask=mask)
128
-
129
- dw_row += dy * x_hat
130
- db_row += dy
131
-
132
- X_ptr += stride_x
133
- Mean_ptr += 1
134
- RSTD_ptr += 1
135
- DX_ptr += stride_dx
136
- DY_ptr += stride_dy
137
-
138
- tl.store(DW_ptr + row_block_id * stride_dw + cols, dw_row.to(dtype), mask=mask)
139
- tl.store(DB_ptr + row_block_id * stride_db + cols, db_row.to(dtype), mask=mask)
113
+ # Pre-load weights once (same optimization as forward pass)
114
+ w = tl.load(W_ptr + cols, mask=mask, other=0.0)
115
+ w_f32 = w.to(tl.float32)
116
+ n_cols_f32 = n_cols.to(tl.float32)
117
+
118
+ # Calculate pointers for this specific row
119
+ row_X_ptr = X_ptr + row_idx * stride_x
120
+ row_DX_ptr = DX_ptr + row_idx * stride_dx
121
+ row_DY_ptr = DY_ptr + row_idx * stride_dy
122
+ row_Mean_ptr = Mean_ptr + row_idx
123
+ row_RSTD_ptr = RSTD_ptr + row_idx
124
+
125
+ # Load data for this row
126
+ x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
127
+ dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
128
+ mean = tl.load(row_Mean_ptr)
129
+ rstd = tl.load(row_RSTD_ptr)
130
+
131
+ # Convert to fp32 for numerical stability
132
+ x_f32 = x.to(tl.float32)
133
+ dy_f32 = dy.to(tl.float32)
134
+ mean_f32 = mean.to(tl.float32)
135
+ rstd_f32 = rstd.to(tl.float32)
136
+
137
+ # Compute backward pass for this row
138
+ x_hat = (x_f32 - mean_f32) * rstd_f32
139
+ wdy = w_f32 * dy_f32
140
+ c1 = tl.sum(x_hat * wdy, axis=0) / n_cols_f32
141
+ c2 = tl.sum(wdy, axis=0) / n_cols_f32
142
+ dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
143
+
144
+ # Store input gradient
145
+ tl.store(row_DX_ptr + cols, dx.to(dtype), mask=mask)
146
+
147
+ # Accumulate weight and bias gradients using atomic operations
148
+ dw = dy_f32 * x_hat
149
+ db = dy_f32
150
+ tl.atomic_add(DW_ptr + cols, dw.to(atomic_dtype), mask=mask)
151
+ tl.atomic_add(DB_ptr + cols, db.to(atomic_dtype), mask=mask)
140
152
 
141
153
 
142
154
  def layer_norm_forward(X, W, B, eps):
155
+ """
156
+ Args:
157
+ X: Input tensor of shape (..., hidden_size)
158
+ W: Weight tensor of shape (hidden_size,)
159
+ B: Bias tensor of shape (hidden_size,)
160
+ eps: Small constant for numerical stability
161
+
162
+ Returns:
163
+ Tuple of (output, input, mean, rstd, block_size, num_warps)
164
+ """
143
165
  shape = X.shape
144
166
  dim = shape[-1]
145
167
  X = X.view(-1, dim)
146
168
  n_rows, n_cols = X.shape
169
+
170
+ # Calculate optimal block size and warp configuration
147
171
  BLOCK_SIZE, num_warps = calculate_settings(n_cols)
172
+
173
+ # Allocate output tensors
148
174
  Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
149
175
  Mean = torch.empty(n_rows, dtype=X.dtype, device=X.device)
150
176
  RSTD = torch.empty(n_rows, dtype=X.dtype, device=X.device)
177
+
178
+ # Validate input dimensions
151
179
  if X.shape[1] != W.shape[0]:
152
180
  raise ValueError(
153
181
  f"Incompatible dimensions: input feature size (X.shape[1]={X.shape[1]}) "
@@ -159,7 +187,9 @@ def layer_norm_forward(X, W, B, eps):
159
187
  if X.device.type == "xpu":
160
188
  kernel_args["grf_mode"] = "large"
161
189
 
162
- _layer_norm_forward_kernel[(n_rows,)](
190
+ # Launch kernel with one thread block per row for optimal performance
191
+ grid = (n_rows,)
192
+ _layer_norm_forward_kernel[grid](
163
193
  Y,
164
194
  Y.stride(0),
165
195
  X,
@@ -176,35 +206,43 @@ def layer_norm_forward(X, W, B, eps):
176
206
  eps,
177
207
  BLOCK_SIZE=BLOCK_SIZE,
178
208
  num_warps=num_warps,
179
- **kernel_args, # XPU-specific optimization
209
+ **kernel_args,
180
210
  )
211
+
181
212
  return Y.view(*shape), X, Mean, RSTD, BLOCK_SIZE, num_warps
182
213
 
183
214
 
184
215
  def layer_norm_backward(dY, X, W, B, Mean, RSTD):
216
+ """
217
+ Args:
218
+ dY: Gradient of output
219
+ X: Input tensor
220
+ W: Weight tensor
221
+ B: Bias tensor
222
+ Mean: Pre-computed mean
223
+ RSTD: Pre-computed reciprocal standard deviation
224
+
225
+ Returns:
226
+ Tuple of (input_grad, weight_grad, bias_grad)
227
+ """
185
228
  shape = dY.shape
186
229
  dim = shape[-1]
187
230
  dY = dY.view(-1, dim)
188
231
  n_rows, n_cols = dY.shape
189
232
 
190
- sm_count = 1
191
- if X.device.type == "cuda":
192
- sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
193
- elif X.device.type == "xpu":
194
- sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
195
-
233
+ # Allocate gradient tensors
196
234
  DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
197
- _DW = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)
198
- _DB = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)
235
+ # Use float32 for weight/bias gradients if bfloat16 (due to atomic_add limitation)
236
+ grad_dtype = torch.float32 if W.dtype == torch.bfloat16 else W.dtype
237
+ DW = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
238
+ DB = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
199
239
 
240
+ # Calculate optimal block size and warp configuration
200
241
  BLOCK_SIZE, num_warps = calculate_settings(n_cols)
201
242
  if n_cols > BLOCK_SIZE:
202
- raise RuntimeError(
203
- f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
204
- )
243
+ raise RuntimeError(f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}.")
205
244
 
206
- rows_per_program = math.ceil(n_rows / sm_count)
207
- grid = (sm_count,)
245
+ # Determine dtype for triton operations
208
246
  triton_dtype = (
209
247
  tl.float32
210
248
  if X.dtype == torch.float32
@@ -212,41 +250,41 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
212
250
  if X.dtype == torch.bfloat16
213
251
  else tl.float16
214
252
  if X.dtype == torch.float16
215
- else tl.float32 # fallback to float32 for other types
253
+ else tl.float32 # fallback
216
254
  )
217
255
 
256
+ # Use float32 for atomic operations if bfloat16 is not supported
257
+ atomic_dtype = tl.float32 if triton_dtype == tl.bfloat16 else triton_dtype
258
+
218
259
  # XPU-specific optimization
219
260
  kernel_args = {}
220
261
  if X.device.type == "xpu":
221
262
  kernel_args.update({"grf_mode": "large", "num_warps": 32, "num_stages": 4})
222
263
 
264
+ # Launch kernel with one thread block per row for optimal performance
265
+ grid = (n_rows,)
223
266
  _layer_norm_backward_kernel[grid](
224
267
  X,
225
268
  W,
226
269
  Mean,
227
270
  RSTD,
228
271
  DX,
229
- _DW,
230
- _DB,
272
+ DW,
273
+ DB,
231
274
  dY,
232
275
  X.stride(0),
233
276
  DX.stride(0),
234
- _DW.stride(0),
235
- _DB.stride(0),
236
277
  dY.stride(0),
237
- n_rows,
238
278
  n_cols,
239
- rows_per_program,
240
279
  BLOCK_SIZE=BLOCK_SIZE,
241
280
  dtype=triton_dtype,
242
- **kernel_args, # XPU-specific optimization
281
+ atomic_dtype=atomic_dtype,
282
+ num_warps=num_warps,
283
+ **kernel_args,
243
284
  )
244
285
 
245
- DW = _DW.sum(dim=0).to(W.dtype)
246
- DB = _DB.sum(dim=0).to(W.dtype)
247
-
248
286
  DX = DX.view(*shape)
249
- return DX, DW, DB
287
+ return DX, DW.to(W.dtype), DB.to(W.dtype)
250
288
 
251
289
 
252
290
  class LigerLayerNormFunction(torch.autograd.Function):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.0.dev20250718080702
3
+ Version: 0.6.0.dev20250719041256
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -16,12 +16,14 @@ device = infer_device()
16
16
  (4, 16, 128),
17
17
  (1, 1, 1023), # Minimal batch/seq with near power-of-2 hidden
18
18
  (3, 7, 256), # Prime numbers for batch/seq
19
+ (1, 1, 1500),
19
20
  ],
20
21
  )
21
22
  @pytest.mark.parametrize(
22
23
  "dtype, atol, rtol",
23
24
  [
24
25
  (torch.float32, 1e-5, 1e-5),
26
+ (torch.bfloat16, 2e-2, 2e-2), # Relaxed tolerance for bfloat16 due to lower precision + atomic limitations
25
27
  ],
26
28
  )
27
29
  def test_liger_layer_norm(
@@ -72,6 +74,7 @@ def test_liger_layer_norm(
72
74
  "dtype, atol, rtol",
73
75
  [
74
76
  (torch.float32, 1e-5, 1e-5),
77
+ (torch.bfloat16, 2e-2, 2e-2), # Relaxed tolerance for bfloat16 due to lower precision + atomic limitations
75
78
  ],
76
79
  )
77
80
  def test_liger_layer_norm_functional(
@@ -1634,6 +1634,7 @@ def test_apply_liger_kernel_to_instance_for_glm4():
1634
1634
  pytest.fail(f"An exception occured in extra_expr: {type(e).__name__} - {e}")
1635
1635
 
1636
1636
 
1637
+ @pytest.mark.skipif(not is_smollm3_available(), reason="smollm3 module not available")
1637
1638
  def test_apply_liger_kernel_to_instance_for_smollm3():
1638
1639
  # Ensure any monkey patching is cleaned up for subsequent tests
1639
1640
  with patch("transformers.models.smollm3.modeling_smollm3"):