liger-kernel-nightly 0.6.3.dev20251121010234__tar.gz → 0.6.4.dev20251208235806__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of liger-kernel-nightly might be problematic (more details are linked from the original diff page).

Files changed (306)
  1. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/PKG-INFO +8 -1
  2. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/README.md +7 -0
  3. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/dev/modal/tests.py +1 -1
  4. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/dev/modal/tests_bwd.py +1 -1
  5. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/pyproject.toml +1 -1
  6. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/setup.py +20 -1
  7. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +21 -5
  8. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/grpo_loss.py +8 -5
  9. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/cross_entropy.py +2 -1
  10. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/dyt.py +5 -2
  11. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/fused_add_rms_norm.py +5 -1
  12. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/geglu.py +2 -1
  13. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/group_norm.py +2 -1
  14. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/grpo_loss.py +3 -1
  15. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/layer_norm.py +86 -66
  16. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/poly_norm.py +5 -1
  17. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/rms_norm.py +7 -2
  18. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/utils.py +2 -0
  19. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/__init__.py +12 -0
  20. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/grpo_loss.py +56 -1
  21. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/gemma3.py +1 -0
  22. liger_kernel_nightly-0.6.4.dev20251208235806/src/liger_kernel/transformers/model/gpt_oss.py +211 -0
  23. liger_kernel_nightly-0.6.4.dev20251208235806/src/liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  24. liger_kernel_nightly-0.6.4.dev20251208235806/src/liger_kernel/transformers/model/olmo3.py +142 -0
  25. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/paligemma.py +1 -0
  26. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/monkey_patch.py +263 -0
  27. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/swiglu.py +17 -0
  28. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/utils.py +25 -0
  29. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel_nightly.egg-info/PKG-INFO +8 -1
  30. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel_nightly.egg-info/SOURCES.txt +3 -0
  31. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/test_grpo_loss.py +224 -47
  32. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/conftest.py +4 -0
  33. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/bf16/test_mini_models.py +241 -0
  34. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/bf16/test_mini_models_with_logits.py +175 -0
  35. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/fp32/test_mini_models.py +223 -0
  36. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/fp32/test_mini_models_with_logits.py +164 -0
  37. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_layer_norm.py +1 -0
  38. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_monkey_patch.py +151 -0
  39. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/utils.py +53 -0
  40. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  41. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  42. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/pull_request_template.md +0 -0
  43. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/workflows/amd-ci.yml +0 -0
  44. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/workflows/benchmark.yml +0 -0
  45. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/workflows/docs.yml +0 -0
  46. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/workflows/intel-ci.yml +0 -0
  47. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/workflows/nvi-ci.yml +0 -0
  48. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/workflows/publish-nightly.yml +0 -0
  49. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.github/workflows/publish-release.yml +0 -0
  50. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/.gitignore +0 -0
  51. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/LICENSE +0 -0
  52. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/Makefile +0 -0
  53. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/NOTICE +0 -0
  54. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/README.md +0 -0
  55. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/__init__.py +0 -0
  56. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/benchmarks_visualizer.py +0 -0
  57. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/data/all_benchmark_data.csv +0 -0
  58. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/__init__.py +0 -0
  59. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  60. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  61. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  62. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  63. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  64. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_dyt.py +0 -0
  65. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_embedding.py +0 -0
  66. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
  67. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  68. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  69. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  70. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_geglu.py +0 -0
  71. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_group_norm.py +0 -0
  72. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_grpo_loss.py +0 -0
  73. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_jsd.py +0 -0
  74. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_kl_div.py +0 -0
  75. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  76. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  77. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
  78. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  79. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  80. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_poly_norm.py +0 -0
  81. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  82. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  83. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_rope.py +0 -0
  84. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  85. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_softmax.py +0 -0
  86. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  87. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  88. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_swiglu.py +0 -0
  89. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_tiled_mlp.py +0 -0
  90. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/benchmark_tvd.py +0 -0
  91. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/benchmark/scripts/utils.py +0 -0
  92. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/dev/fmt-requirements.txt +0 -0
  93. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/dev/modal/benchmarks.py +0 -0
  94. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/Examples.md +0 -0
  95. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/Getting-Started.md +0 -0
  96. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/High-Level-APIs.md +0 -0
  97. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/Low-Level-APIs.md +0 -0
  98. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/acknowledgement.md +0 -0
  99. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/contributing.md +0 -0
  100. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/images/banner.GIF +0 -0
  101. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/images/compose.gif +0 -0
  102. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/images/e2e-memory.png +0 -0
  103. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/images/e2e-tps.png +0 -0
  104. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/images/logo-banner.png +0 -0
  105. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/images/patch.gif +0 -0
  106. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/images/post-training.png +0 -0
  107. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/index.md +0 -0
  108. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/docs/license.md +0 -0
  109. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/alignment/accelerate_config.yaml +0 -0
  110. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/alignment/run_orpo.py +0 -0
  111. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/README.md +0 -0
  112. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/callback.py +0 -0
  113. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/config/fsdp_config.json +0 -0
  114. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  115. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  116. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  117. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/img/llama_tps.png +0 -0
  118. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  119. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/img/qwen_tps.png +0 -0
  120. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/launch_on_modal.py +0 -0
  121. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/requirements.txt +0 -0
  122. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/run_benchmarks.sh +0 -0
  123. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/run_gemma.sh +0 -0
  124. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/run_llama.sh +0 -0
  125. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/run_qwen.sh +0 -0
  126. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/run_qwen2_vl.sh +0 -0
  127. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/training.py +0 -0
  128. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/huggingface/training_multimodal.py +0 -0
  129. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/lightning/README.md +0 -0
  130. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/lightning/requirements.txt +0 -0
  131. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/lightning/training.py +0 -0
  132. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/README.md +0 -0
  133. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/callback.py +0 -0
  134. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  135. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  136. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  137. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  138. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  139. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  140. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  141. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  142. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  143. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/medusa_util.py +0 -0
  144. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/requirements.txt +0 -0
  145. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  146. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/examples/medusa/train.py +0 -0
  147. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/licenses/LICENSE-Apache-2.0 +0 -0
  148. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  149. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  150. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/licenses/LICENSE-MIT-llmc +0 -0
  151. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/licenses/LICENSE-MIT-triton +0 -0
  152. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/mkdocs.yml +0 -0
  153. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/setup.cfg +0 -0
  154. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/__init__.py +0 -0
  155. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/README.md +0 -0
  156. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  157. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
  158. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  159. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  160. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/functional.py +0 -0
  161. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  162. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  163. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  164. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  165. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  166. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  167. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  168. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/env_report.py +0 -0
  169. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/__init__.py +0 -0
  170. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  171. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  172. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  173. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  174. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  175. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/jsd.py +0 -0
  176. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/kl_div.py +0 -0
  177. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/llama4_rope.py +0 -0
  178. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  179. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  180. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/rope.py +0 -0
  181. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/softmax.py +0 -0
  182. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/sparsemax.py +0 -0
  183. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/swiglu.py +0 -0
  184. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/tiled_mlp.py +0 -0
  185. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/tvd.py +0 -0
  186. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/auto_model.py +0 -0
  187. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  188. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/dyt.py +0 -0
  189. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
  190. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  191. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/fsdp.py +0 -0
  192. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/functional.py +0 -0
  193. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
  194. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  195. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  196. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  197. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/geglu.py +0 -0
  198. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/group_norm.py +0 -0
  199. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/jsd.py +0 -0
  200. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/kl_div.py +0 -0
  201. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/layer_norm.py +0 -0
  202. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/llama4_rope.py +0 -0
  203. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/__init__.py +0 -0
  204. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/falcon_h1.py +0 -0
  205. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/gemma.py +0 -0
  206. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  207. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/glm4.py +0 -0
  208. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/glm4v.py +0 -0
  209. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/glm4v_moe.py +0 -0
  210. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/internvl.py +0 -0
  211. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/llama.py +0 -0
  212. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/llama4.py +0 -0
  213. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/llava.py +0 -0
  214. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  215. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/mistral.py +0 -0
  216. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  217. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/mllama.py +0 -0
  218. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  219. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/output_classes.py +0 -0
  220. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/phi3.py +0 -0
  221. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  222. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  223. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  224. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  225. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  226. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/qwen3_next.py +0 -0
  227. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/qwen3_vl.py +0 -0
  228. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/qwen3_vl_moe.py +0 -0
  229. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/smollm3.py +0 -0
  230. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/smolvlm.py +0 -0
  231. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  232. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/poly_norm.py +0 -0
  233. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  234. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/rms_norm.py +0 -0
  235. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/rope.py +0 -0
  236. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/softmax.py +0 -0
  237. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/sparsemax.py +0 -0
  238. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/tiled_mlp.py +0 -0
  239. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  240. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  241. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  242. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/tvd.py +0 -0
  243. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/triton/__init__.py +0 -0
  244. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/triton/monkey_patch.py +0 -0
  245. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  246. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  247. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  248. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/__init__.py +0 -0
  249. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/__init__.py +0 -0
  250. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/test_cosine_loss.py +0 -0
  251. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/test_cpo_loss.py +0 -0
  252. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/test_dpo_loss.py +0 -0
  253. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/test_jsd_loss.py +0 -0
  254. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/test_kto_loss.py +0 -0
  255. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/test_orpo_loss.py +0 -0
  256. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/chunked_loss/test_simpo_loss.py +0 -0
  257. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/__init__.py +0 -0
  258. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/bf16/__init__.py +0 -0
  259. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
  260. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/fp32/__init__.py +0 -0
  261. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
  262. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  263. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  264. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json +0 -0
  265. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  266. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  267. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  268. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json +0 -0
  269. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  270. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  271. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/Qwen/Qwen3-VL-4B-Instruct/tokenizer_config.json +0 -0
  272. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  273. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  274. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  275. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/tiny_shakespeare.txt +0 -0
  276. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  277. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  278. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  279. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_auto_model.py +0 -0
  280. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_cross_entropy.py +0 -0
  281. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_dyt.py +0 -0
  282. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_embedding.py +0 -0
  283. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_flex_attention.py +0 -0
  284. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_fused_add_rms_norm.py +0 -0
  285. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  286. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_fused_linear_jsd.py +0 -0
  287. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  288. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_geglu.py +0 -0
  289. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_group_norm.py +0 -0
  290. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_grpo_loss.py +0 -0
  291. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_jsd.py +0 -0
  292. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_kl_div.py +0 -0
  293. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_mm_int8int2.py +0 -0
  294. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_multi_token_attention.py +0 -0
  295. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_poly_norm.py +0 -0
  296. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_qwen2vl_mrope.py +0 -0
  297. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_rms_norm.py +0 -0
  298. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_rope.py +0 -0
  299. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_softmax.py +0 -0
  300. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_sparsemax.py +0 -0
  301. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_swiglu.py +0 -0
  302. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_tiled_mlp.py +0 -0
  303. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_trainer_integration.py +0 -0
  304. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_transformers.py +0 -0
  305. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/transformers/test_tvd.py +0 -0
  306. {liger_kernel_nightly-0.6.3.dev20251121010234 → liger_kernel_nightly-0.6.4.dev20251208235806}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.3.dev20251121010234
3
+ Version: 0.6.4.dev20251208235806
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -113,6 +113,8 @@ We've also added optimized Post-Training kernels that deliver **up to 80% memory
113
113
 
114
114
  You can view the documentation site for additional installation, usage examples, and API references: https://linkedin.github.io/Liger-Kernel/
115
115
 
116
+ You can view the Liger Kernel Technical Report: https://openreview.net/forum?id=36SjAIT42G
117
+
116
118
  ## Supercharge Your Model with Liger Kernel
117
119
 
118
120
  ![Banner](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/banner.GIF)
@@ -310,8 +312,12 @@ loss.backward()
310
312
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
311
313
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
312
314
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
315
+ | Olmo3 | `liger_kernel.transformers.apply_liger_kernel_to_olmo3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
313
316
  | GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
317
+ | GPT-OSS | `liger_kernel.transformers.apply_liger_kernel_to_gpt_oss` | RoPE, RMSNorm, CrossEntropyLoss, FusedLinearCrossEntropy |
314
318
  | InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
319
+ | HunyuanV1 | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
320
+ | HunyuanV1 MoE | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
315
321
 
316
322
 
317
323
  ## Low-level APIs
@@ -438,3 +444,4 @@ url={https://openreview.net/forum?id=36SjAIT42G}
438
444
  ↑ Back to Top ↑
439
445
  </a>
440
446
  </p>
447
+
@@ -65,6 +65,8 @@ We've also added optimized Post-Training kernels that deliver **up to 80% memory
65
65
 
66
66
  You can view the documentation site for additional installation, usage examples, and API references: https://linkedin.github.io/Liger-Kernel/
67
67
 
68
+ You can view the Liger Kernel Technical Report: https://openreview.net/forum?id=36SjAIT42G
69
+
68
70
  ## Supercharge Your Model with Liger Kernel
69
71
 
70
72
  ![Banner](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/banner.GIF)
@@ -262,8 +264,12 @@ loss.backward()
262
264
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
263
265
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
264
266
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
267
+ | Olmo3 | `liger_kernel.transformers.apply_liger_kernel_to_olmo3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
265
268
  | GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
269
+ | GPT-OSS | `liger_kernel.transformers.apply_liger_kernel_to_gpt_oss` | RoPE, RMSNorm, CrossEntropyLoss, FusedLinearCrossEntropy |
266
270
  | InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
271
+ | HunyuanV1 | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
272
+ | HunyuanV1 MoE | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
267
273
 
268
274
 
269
275
  ## Low-level APIs
@@ -390,3 +396,4 @@ url={https://openreview.net/forum?id=36SjAIT42G}
390
396
  ↑ Back to Top ↑
391
397
  </a>
392
398
  </p>
399
+
@@ -14,7 +14,7 @@ app = modal.App("liger_tests", image=image)
14
14
  repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
15
15
 
16
16
 
17
- @app.function(gpu="H100!", image=repo, timeout=60 * 60)
17
+ @app.function(gpu="H100!", image=repo, timeout=90 * 60)
18
18
  def liger_tests():
19
19
  import subprocess
20
20
 
@@ -14,7 +14,7 @@ app = modal.App("liger_tests_bwd", image=image)
14
14
  repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
15
15
 
16
16
 
17
- @app.function(gpu="H100!", image=repo, timeout=60 * 60)
17
+ @app.function(gpu="H100!", image=repo, timeout=90 * 60)
18
18
  def liger_bwd_tests():
19
19
  import subprocess
20
20
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "liger_kernel_nightly"
7
- version = "0.6.3.dev20251121010234"
7
+ version = "0.6.4.dev20251208235806"
8
8
  description = "Efficient Triton kernels for LLM Training"
9
9
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -24,6 +24,8 @@ def get_default_dependencies():
24
24
  return [
25
25
  "torch>=2.6.0",
26
26
  ]
27
+ elif platform == "npu":
28
+ return ["torch_npu==2.6.0", "triton-ascend"]
27
29
 
28
30
 
29
31
  def get_optional_dependencies():
@@ -67,7 +69,21 @@ def is_xpu_available():
67
69
  return False
68
70
 
69
71
 
70
- def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu"]:
72
+ def is_ascend_available() -> bool:
73
+ """Best-effort Ascend detection.
74
+
75
+ Runs `npu-smi info` and returns True when it succeeds; returns False when
76
+ the utility is not found or the command fails.
77
+ """
78
+ try:
79
+ subprocess.run(["npu-smi", "info"], check=True)
80
+ return True
81
+ except (subprocess.SubprocessError, FileNotFoundError):
82
+ pass
83
+ return False
84
+
85
+
86
+ def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu", "npu"]:
71
87
  """
72
88
  Detect the accelerator platform (NVIDIA CUDA, AMD ROCm, Intel XPU, or Ascend NPU, falling back to CPU) without torch dependency.
73
89
  """
@@ -86,6 +102,9 @@ def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu"]:
86
102
  if is_xpu_available():
87
103
  print("Intel GPU detected")
88
104
  return "xpu"
105
+ elif is_ascend_available():
106
+ print("Ascend NPU detected")
107
+ return "npu"
89
108
  else:
90
109
  print("No GPU detected")
91
110
  return "cpu"
@@ -32,7 +32,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
32
32
  epsilon_low=0.2,
33
33
  epsilon_high=0.2,
34
34
  beta=0.04,
35
- loss_type="bnpo",
35
+ loss_type="dapo",
36
36
  max_completion_length=None,
37
37
  importance_sampling_level="token",
38
38
  temperature=1.0,
@@ -60,7 +60,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
60
60
  epsilon_low: Lower bound for clipping the importance sampling ratio
61
61
  epsilon_high: Upper bound for clipping the importance sampling ratio
62
62
  beta: Weight for the KL penalty
63
- loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo")
63
+ loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo")
64
64
  max_completion_length: Maximum completion length required for "dr_grpo"
65
65
  temperature: Temperature for the logits
66
66
  compiled: Whether to use torch compile
@@ -244,6 +244,21 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
244
244
 
245
245
  return loss_acc, tuple(final_metrics)
246
246
 
247
+ @staticmethod
248
+ def _compute_dapo_normalizer(attention_mask):
249
+ """Global active tokens averaged per process."""
250
+ normalizer = attention_mask.to(torch.float32).sum()
251
+ world_size = 1
252
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
253
+ import torch.distributed as dist
254
+
255
+ normalizer = normalizer.clone()
256
+ dist.all_reduce(normalizer, op=dist.ReduceOp.SUM)
257
+ world_size = dist.get_world_size()
258
+
259
+ normalizer = normalizer / world_size
260
+ return torch.clamp(normalizer, min=1.0)
261
+
247
262
  @staticmethod
248
263
  def _compute_chunk_loss(
249
264
  input_chunk,
@@ -261,7 +276,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
261
276
  epsilon_low=0.2,
262
277
  epsilon_high=0.2,
263
278
  beta=0.04,
264
- loss_type="bnpo",
279
+ loss_type="dapo",
265
280
  max_completion_length=None,
266
281
  importance_sampling_level="token",
267
282
  temperature=1.0,
@@ -341,10 +356,11 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
341
356
  None, # grad_epsilon_low
342
357
  None, # grad_epsilon_high
343
358
  None, # grad_beta
359
+ None, # grad_loss_type
360
+ None, # grad_max_completion_length
361
+ None, # grad_importance_sampling_level
344
362
  None, # grad_temperature
345
363
  None, # grad_compiled
346
364
  None, # grad_use_ref_model
347
365
  None, # grad_chunk_size
348
- None, # grad_loss_type
349
- None, # grad_max_completion_length
350
366
  )
@@ -29,7 +29,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
29
29
  epsilon_low=0.2,
30
30
  epsilon_high=0.2,
31
31
  beta=0.04,
32
- loss_type="bnpo", # ["grpo", "bnpo", "dr_grpo"]
32
+ loss_type="dapo", # ["grpo", "bnpo", "dr_grpo", "dapo"]
33
33
  max_completion_length=None, # Required for dr_grpo
34
34
  importance_sampling_level="token", # ["token", "sequence"] - new parameter for GSPO
35
35
  **kwargs,
@@ -94,6 +94,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
94
94
  if max_completion_length is None:
95
95
  raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'")
96
96
  loss = (per_token_loss * attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length)
97
+ elif loss_type == "dapo":
98
+ loss_normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(full_attention_mask)
99
+ loss = (per_token_loss * attention_mask).sum() / loss_normalizer
97
100
  else:
98
101
  raise ValueError(f"Unknown loss type: {loss_type}")
99
102
 
@@ -135,7 +138,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
135
138
  beta=0.04,
136
139
  epsilon_low=0.2,
137
140
  epsilon_high=0.2,
138
- loss_type="bnpo",
141
+ loss_type="dapo",
139
142
  max_completion_length=None,
140
143
  importance_sampling_level="token",
141
144
  temperature=1.0,
@@ -157,7 +160,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
157
160
  ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
158
161
  ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
159
162
  beta (float): Weight for the KL penalty
160
- loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
163
+ loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
161
164
  max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
162
165
  importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
163
166
  temperature (float): Temperature for the logits
@@ -235,7 +238,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
235
238
  chunk_size: int = 1,
236
239
  epsilon_low: float = 0.2,
237
240
  epsilon_high: float = 0.2,
238
- loss_type: str = "bnpo",
241
+ loss_type: str = "dapo",
239
242
  max_completion_length: Optional[int] = None,
240
243
  importance_sampling_level: str = "token",
241
244
  temperature: float = 1.0,
@@ -248,7 +251,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
248
251
  chunk_size (int): Size of chunks for processing.
249
252
  epsilon_low (float): Lower bound for the importance sampling ratio.
250
253
  epsilon_high (float): Upper bound for the importance sampling ratio.
251
- loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
254
+ loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
252
255
  max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
253
256
  importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
254
257
  temperature (float): Temperature for the logits.
@@ -10,8 +10,9 @@ from liger_kernel.ops.utils import compare_version
10
10
  from liger_kernel.ops.utils import element_mul_kernel
11
11
  from liger_kernel.ops.utils import is_hip
12
12
  from liger_kernel.utils import infer_device
13
+ from liger_kernel.utils import is_npu_available
13
14
 
14
- if compare_version("triton", operator.ge, "3.0.0"):
15
+ if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
15
16
  try:
16
17
  # typical import path with dispatch available
17
18
  from triton.language.extra.libdevice import tanh
@@ -7,8 +7,10 @@ import triton.language as tl
7
7
  from liger_kernel.ops.utils import compare_version
8
8
  from liger_kernel.ops.utils import ensure_contiguous
9
9
  from liger_kernel.ops.utils import infer_device
10
+ from liger_kernel.utils import get_npu_multi_processor_count
11
+ from liger_kernel.utils import is_npu_available
10
12
 
11
- if compare_version("triton", operator.ge, "3.0.0"):
13
+ if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
12
14
  try:
13
15
  # typical import path with dispatch available
14
16
  from triton.language.extra.libdevice import tanh
@@ -125,7 +127,8 @@ def liger_dyt_bwd(dy, x, alpha, gamma, beta):
125
127
  NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count
126
128
  elif device == "xpu":
127
129
  NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count
128
-
130
+ elif device == "npu":
131
+ NUM_SMS = get_npu_multi_processor_count()
129
132
  da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
130
133
  dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
131
134
  db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None
@@ -9,8 +9,10 @@ from liger_kernel.ops.utils import calculate_settings
9
9
  from liger_kernel.ops.utils import compare_version
10
10
  from liger_kernel.ops.utils import ensure_contiguous
11
11
  from liger_kernel.ops.utils import torch_to_triton_dtype
12
+ from liger_kernel.utils import get_npu_multi_processor_count
13
+ from liger_kernel.utils import is_npu_available
12
14
 
13
- if compare_version("triton", operator.ge, "3.0.0"):
15
+ if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
14
16
  try:
15
17
  # typical import path with dispatch available
16
18
  from triton.language.extra.libdevice import rsqrt
@@ -293,6 +295,8 @@ def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BL
293
295
  sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
294
296
  elif S.device.type == "xpu":
295
297
  sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count
298
+ elif S.device.type == "npu":
299
+ sm_count = get_npu_multi_processor_count()
296
300
 
297
301
  # fp32 for numerical stability especially.
298
302
  _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
@@ -7,8 +7,9 @@ import triton.language as tl
7
7
  from liger_kernel.ops.utils import calculate_settings
8
8
  from liger_kernel.ops.utils import compare_version
9
9
  from liger_kernel.ops.utils import ensure_contiguous
10
+ from liger_kernel.utils import is_npu_available
10
11
 
11
- if compare_version("triton", operator.ge, "3.0.0"):
12
+ if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
12
13
  try:
13
14
  # typical import path with dispatch available
14
15
  from triton.language.extra.libdevice import tanh
@@ -6,8 +6,9 @@ import triton.language as tl
6
6
 
7
7
  from liger_kernel.ops.utils import compare_version
8
8
  from liger_kernel.ops.utils import ensure_contiguous
9
+ from liger_kernel.utils import is_npu_available
9
10
 
10
- if compare_version("triton", operator.ge, "3.0.0"):
11
+ if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
11
12
  try:
12
13
  # typical import path with dispatch available
13
14
  from triton.language.extra.libdevice import rsqrt
@@ -128,7 +128,9 @@ def _grpo_loss_fwd_kernel(
128
128
  per_token_loss1 = coef_1 * advantage
129
129
  per_token_loss2 = coef_2 * advantage
130
130
  per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2)
131
- is_clipped = per_token_loss1 < per_token_loss2
131
+ is_low_clipped = (coef_1 < 1 - EPS_LOW) & (advantage < 0)
132
+ is_high_clipped = (coef_1 > 1 + EPS_HIGH) & (advantage > 0)
133
+ is_clipped = is_low_clipped | is_high_clipped
132
134
 
133
135
  if BETA != 0.0:
134
136
  REF_LOGP += off_b * L + off_l
@@ -1,3 +1,4 @@
1
+ import math
1
2
  import operator
2
3
 
3
4
  import torch
@@ -7,8 +8,9 @@ import triton.language as tl
7
8
  from liger_kernel.ops.utils import calculate_settings
8
9
  from liger_kernel.ops.utils import compare_version
9
10
  from liger_kernel.ops.utils import ensure_contiguous
11
+ from liger_kernel.utils import is_npu_available
10
12
 
11
- if compare_version("triton", operator.ge, "3.0.0"):
13
+ if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
12
14
  try:
13
15
  # typical import path with dispatch available
14
16
  from triton.language.extra.libdevice import rsqrt
@@ -85,68 +87,87 @@ def _layer_norm_forward_kernel(
85
87
  @triton.jit
86
88
  def _layer_norm_backward_kernel(
87
89
  X_ptr, # pointer to input, shape (n_rows, n_cols)
90
+ stride_x, # stride of each row in input
88
91
  W_ptr, # pointer to weights, shape (n_cols,)
89
92
  Mean_ptr, # pointer to mean, shape (n_rows,)
93
+ stride_mean, # stride of each row in mean
90
94
  RSTD_ptr, # pointer to rstd, shape (n_rows,)
95
+ stride_rstd, # stride of each row in rstd
91
96
  DX_ptr, # pointer to input grad, shape (n_rows, n_cols)
97
+ stride_dx, # stride of each row in input grad
92
98
  DW_ptr, # pointer to weights grad, shape (n_cols,)
99
+ stride_dw, # stride of each row in weights grad
93
100
  DB_ptr, # pointer to bias grad, shape (n_cols,)
101
+ stride_db, # stride of each row in bias grad
94
102
  DY_ptr, # pointer to output grad, shape (n_rows, n_cols)
95
- stride_x, # stride of each row in input
96
- stride_dx, # stride of each row in input grad
97
103
  stride_dy, # stride of each row in output grad
104
+ n_rows,
98
105
  n_cols,
106
+ rows_per_program: tl.constexpr,
99
107
  BLOCK_SIZE: tl.constexpr,
100
- dtype: tl.constexpr,
101
- atomic_dtype: tl.constexpr,
102
108
  ):
103
109
  """
104
110
  References:
105
111
  https://arxiv.org/abs/1607.06450
106
112
  https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
107
113
  """
108
- row_idx = tl.program_id(0).to(tl.int64)
114
+ row_block_id = tl.program_id(0).to(tl.int64)
115
+ row_start = row_block_id * rows_per_program
116
+ row_end = min((row_block_id + 1) * rows_per_program, n_rows)
109
117
  cols = tl.arange(0, BLOCK_SIZE)
110
118
  mask = cols < n_cols
111
119
 
120
+ dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
121
+ db_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
122
+
112
123
  # Pre-load weights once (same optimization as forward pass)
113
124
  w = tl.load(W_ptr + cols, mask=mask, other=0.0)
114
125
  w_f32 = w.to(tl.float32)
115
126
 
116
127
  # Calculate pointers for this specific row
117
- row_X_ptr = X_ptr + row_idx * stride_x
118
- row_DX_ptr = DX_ptr + row_idx * stride_dx
119
- row_DY_ptr = DY_ptr + row_idx * stride_dy
120
- row_Mean_ptr = Mean_ptr + row_idx
121
- row_RSTD_ptr = RSTD_ptr + row_idx
122
-
123
- # Load data for this row
124
- x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
125
- dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
126
- mean = tl.load(row_Mean_ptr)
127
- rstd = tl.load(row_RSTD_ptr)
128
-
129
- # Convert to fp32 for numerical stability
130
- x_f32 = x.to(tl.float32)
131
- dy_f32 = dy.to(tl.float32)
132
- mean_f32 = mean.to(tl.float32)
133
- rstd_f32 = rstd.to(tl.float32)
134
-
135
- # Compute backward pass for this row
136
- x_hat = (x_f32 - mean_f32) * rstd_f32
137
- wdy = w_f32 * dy_f32
138
- c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
139
- c2 = tl.sum(wdy, axis=0) / n_cols
140
- dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
141
-
142
- # Store input gradient
143
- tl.store(row_DX_ptr + cols, dx.to(dtype), mask=mask)
144
-
145
- # Accumulate weight and bias gradients using atomic operations
146
- dw = dy_f32 * x_hat
147
- db = dy_f32
148
- tl.atomic_add(DW_ptr + cols, dw.to(atomic_dtype), mask=mask)
149
- tl.atomic_add(DB_ptr + cols, db.to(atomic_dtype), mask=mask)
128
+ row_X_ptr = X_ptr + row_start * stride_x
129
+ row_DX_ptr = DX_ptr + row_start * stride_dx
130
+ row_DY_ptr = DY_ptr + row_start * stride_dy
131
+ row_Mean_ptr = Mean_ptr + row_start
132
+ row_RSTD_ptr = RSTD_ptr + row_start
133
+
134
+ for _ in range(row_start, row_end):
135
+ # Load data for this row
136
+ x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
137
+ dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
138
+ mean = tl.load(row_Mean_ptr)
139
+ rstd = tl.load(row_RSTD_ptr)
140
+
141
+ # Convert to fp32 for numerical stability
142
+ x_f32 = x.to(tl.float32)
143
+ dy_f32 = dy.to(tl.float32)
144
+ mean_f32 = mean.to(tl.float32)
145
+ rstd_f32 = rstd.to(tl.float32)
146
+
147
+ # Compute backward pass for this row
148
+ x_hat = (x_f32 - mean_f32) * rstd_f32
149
+ wdy = w_f32 * dy_f32
150
+ c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
151
+ c2 = tl.sum(wdy, axis=0) / n_cols
152
+ dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
153
+
154
+ # Store input gradient
155
+ tl.store(row_DX_ptr + cols, dx, mask=mask)
156
+
157
+ # Accumulate weight and bias gradients for this thread block's assigned rows
158
+ dw = dy_f32 * x_hat
159
+ db = dy_f32
160
+ dW_row += dw
161
+ db_row += db
162
+
163
+ row_X_ptr += stride_x
164
+ row_DX_ptr += stride_dx
165
+ row_DY_ptr += stride_dy
166
+ row_Mean_ptr += stride_mean
167
+ row_RSTD_ptr += stride_rstd
168
+
169
+ tl.store(DW_ptr + row_block_id * stride_dw + cols, dW_row, mask=mask)
170
+ tl.store(DB_ptr + row_block_id * stride_db + cols, db_row, mask=mask)
150
171
 
151
172
 
152
173
  def layer_norm_forward(X, W, B, eps):
@@ -228,31 +249,25 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
228
249
  dY = dY.view(-1, dim)
229
250
  n_rows, n_cols = dY.shape
230
251
 
231
- # Allocate gradient tensors
232
- DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
233
- # Use float32 for weight/bias gradients if bfloat16 (due to atomic_add limitation)
234
- grad_dtype = torch.float32 if W.dtype == torch.bfloat16 else W.dtype
235
- DW = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
236
- DB = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
252
+ sm_count = 1
253
+ if X.device.type == "cuda":
254
+ sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
255
+ elif X.device.type == "xpu":
256
+ sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
257
+
258
+ # fp32 for numerical stability especially.
259
+ _DW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
260
+ _DB = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
237
261
 
238
262
  # Calculate optimal block size and warp configuration
239
263
  BLOCK_SIZE, num_warps = calculate_settings(n_cols)
240
264
  if n_cols > BLOCK_SIZE:
241
265
  raise RuntimeError(f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}.")
266
+ rows_per_program = math.ceil(n_rows / sm_count)
267
+ grid = (sm_count,)
242
268
 
243
- # Determine dtype for triton operations
244
- triton_dtype = (
245
- tl.float32
246
- if X.dtype == torch.float32
247
- else tl.bfloat16
248
- if X.dtype == torch.bfloat16
249
- else tl.float16
250
- if X.dtype == torch.float16
251
- else tl.float32 # fallback
252
- )
253
-
254
- # Use float32 for atomic operations if bfloat16 is not supported
255
- atomic_dtype = tl.float32 if triton_dtype == tl.bfloat16 else triton_dtype
269
+ # Allocate gradient tensors
270
+ DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
256
271
 
257
272
  kernel_args = {"num_warps": num_warps}
258
273
  # XPU-specific optimization
@@ -260,28 +275,33 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
260
275
  kernel_args.update({"grf_mode": "large", "num_warps": 32, "num_stages": 4})
261
276
 
262
277
  # Launch kernel with one thread block per row for optimal performance
263
- grid = (n_rows,)
264
278
  _layer_norm_backward_kernel[grid](
265
279
  X,
280
+ X.stride(0),
266
281
  W,
267
282
  Mean,
283
+ Mean.stride(0),
268
284
  RSTD,
285
+ RSTD.stride(0),
269
286
  DX,
270
- DW,
271
- DB,
272
- dY,
273
- X.stride(0),
274
287
  DX.stride(0),
288
+ _DW,
289
+ _DW.stride(0),
290
+ _DB,
291
+ _DB.stride(0),
292
+ dY,
275
293
  dY.stride(0),
294
+ n_rows,
276
295
  n_cols,
296
+ rows_per_program=rows_per_program,
277
297
  BLOCK_SIZE=BLOCK_SIZE,
278
- dtype=triton_dtype,
279
- atomic_dtype=atomic_dtype,
280
298
  **kernel_args,
281
299
  )
282
300
 
283
301
  DX = DX.view(*shape)
284
- return DX, DW.to(W.dtype), DB.to(W.dtype)
302
+ DW = _DW.sum(dim=0).to(W.dtype)
303
+ DB = _DB.sum(dim=0).to(B.dtype)
304
+ return DX, DW, DB
285
305
 
286
306
 
287
307
  class LigerLayerNormFunction(torch.autograd.Function):
@@ -7,8 +7,10 @@ import triton.language as tl
7
7
  from liger_kernel.ops.utils import calculate_settings
8
8
  from liger_kernel.ops.utils import compare_version
9
9
  from liger_kernel.ops.utils import ensure_contiguous
10
+ from liger_kernel.utils import get_npu_multi_processor_count
11
+ from liger_kernel.utils import is_npu_available
10
12
 
11
- if compare_version("triton", operator.ge, "3.0.0"):
13
+ if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
12
14
  try:
13
15
  from triton.language.extra.libdevice import rsqrt
14
16
  except ModuleNotFoundError:
@@ -290,6 +292,8 @@ def poly_norm_backward(dY, X, W, RSTD, BLOCK_SIZE, num_warps, in_place):
290
292
  sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
291
293
  elif X.device.type == "xpu":
292
294
  sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
295
+ elif X.device.type == "npu":
296
+ sm_count = get_npu_multi_processor_count()
293
297
 
294
298
  # Allocate or reuse gradients
295
299
  if in_place is True:
@@ -21,8 +21,10 @@ from liger_kernel.ops.utils import calculate_settings
21
21
  from liger_kernel.ops.utils import compare_version
22
22
  from liger_kernel.ops.utils import ensure_contiguous
23
23
  from liger_kernel.ops.utils import torch_to_triton_dtype
24
+ from liger_kernel.utils import get_npu_multi_processor_count
25
+ from liger_kernel.utils import is_npu_available
24
26
 
25
- if compare_version("triton", operator.ge, "3.0.0"):
27
+ if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
26
28
  try:
27
29
  # typical import path with dispatch available
28
30
  from triton.language.extra.libdevice import rsqrt
@@ -349,7 +351,8 @@ def _block_rms_norm_backward_kernel(
349
351
 
350
352
  # calculate the gradient of W
351
353
  if casting_mode == _CASTING_MODE_LLAMA:
352
- dW_row += tl.sum(dY_row * (X_row * rstd_row[:, None]).to(X_dtype), 0)
354
+ # TODO(tcc): use tl.sum(..., dtype=tl.float32) once we upgrade to triton>=3.3.0
355
+ dW_row += tl.sum((dY_row * (X_row * rstd_row[:, None]).to(X_dtype)).to(tl.float32), 0)
353
356
  else:
354
357
  # here X_row is already in fp32 (see previous if block)
355
358
  dW_row += tl.sum(dY_row * (X_row * rstd_row[:, None]), 0)
@@ -449,6 +452,8 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
449
452
  sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
450
453
  elif X.device.type == "xpu":
451
454
  sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
455
+ elif X.device.type == "npu":
456
+ sm_count = get_npu_multi_processor_count()
452
457
 
453
458
  # fp32 for numerical stability especially.
454
459
  _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
@@ -78,6 +78,8 @@ def get_amp_custom_fwd_bwd() -> Callable:
78
78
  functools.partial(torch.amp.custom_fwd, device_type=device),
79
79
  functools.partial(torch.amp.custom_bwd, device_type=device),
80
80
  )
81
+ if hasattr(torch, "npu") and getattr(torch.npu, "amp", None) is not None:
82
+ return torch.npu.amp.custom_fwd, torch.npu.amp.custom_bwd
81
83
  return torch.cuda.amp.custom_fwd, torch.cuda.amp.custom_bwd
82
84
 
83
85