liger-kernel-nightly 0.6.2.dev20250903164350__tar.gz → 0.6.2.dev20250903164435__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (284)
  1. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/PKG-INFO +1 -1
  2. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/data/all_benchmark_data.csv +64 -0
  3. liger_kernel_nightly-0.6.2.dev20250903164435/benchmark/scripts/benchmark_grpo_loss.py +234 -0
  4. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/pyproject.toml +1 -1
  5. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +4 -0
  6. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/grpo_loss.py +38 -4
  7. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
  8. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel_nightly.egg-info/SOURCES.txt +1 -0
  9. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/test_grpo_loss.py +35 -4
  10. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  11. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  12. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/pull_request_template.md +0 -0
  13. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/workflows/amd-ci.yml +0 -0
  14. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/workflows/benchmark.yml +0 -0
  15. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/workflows/docs.yml +0 -0
  16. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/workflows/intel-ci.yml +0 -0
  17. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/workflows/nvi-ci.yml +0 -0
  18. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/workflows/publish-nightly.yml +0 -0
  19. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.github/workflows/publish-release.yml +0 -0
  20. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/.gitignore +0 -0
  21. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/LICENSE +0 -0
  22. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/Makefile +0 -0
  23. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/NOTICE +0 -0
  24. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/README.md +0 -0
  25. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/README.md +0 -0
  26. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/__init__.py +0 -0
  27. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/benchmarks_visualizer.py +0 -0
  28. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/__init__.py +0 -0
  29. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  30. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  31. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  32. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  33. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  34. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_dyt.py +0 -0
  35. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_embedding.py +0 -0
  36. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
  37. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  38. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  39. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  40. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_geglu.py +0 -0
  41. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_group_norm.py +0 -0
  42. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_jsd.py +0 -0
  43. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_kl_div.py +0 -0
  44. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  45. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  46. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
  47. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  48. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  49. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  50. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  51. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_rope.py +0 -0
  52. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  53. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_softmax.py +0 -0
  54. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  55. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  56. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_swiglu.py +0 -0
  57. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/benchmark_tvd.py +0 -0
  58. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/benchmark/scripts/utils.py +0 -0
  59. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/dev/fmt-requirements.txt +0 -0
  60. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/dev/modal/benchmarks.py +0 -0
  61. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/dev/modal/tests.py +0 -0
  62. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/dev/modal/tests_bwd.py +0 -0
  63. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/Examples.md +0 -0
  64. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/Getting-Started.md +0 -0
  65. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/High-Level-APIs.md +0 -0
  66. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/Low-Level-APIs.md +0 -0
  67. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/acknowledgement.md +0 -0
  68. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/contributing.md +0 -0
  69. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/images/banner.GIF +0 -0
  70. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/images/compose.gif +0 -0
  71. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/images/e2e-memory.png +0 -0
  72. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/images/e2e-tps.png +0 -0
  73. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/images/logo-banner.png +0 -0
  74. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/images/patch.gif +0 -0
  75. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/images/post-training.png +0 -0
  76. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/index.md +0 -0
  77. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/docs/license.md +0 -0
  78. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/alignment/accelerate_config.yaml +0 -0
  79. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/alignment/run_orpo.py +0 -0
  80. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/README.md +0 -0
  81. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/callback.py +0 -0
  82. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/config/fsdp_config.json +0 -0
  83. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  84. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  85. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  86. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/img/llama_tps.png +0 -0
  87. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  88. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/img/qwen_tps.png +0 -0
  89. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/launch_on_modal.py +0 -0
  90. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/requirements.txt +0 -0
  91. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/run_benchmarks.sh +0 -0
  92. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/run_gemma.sh +0 -0
  93. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/run_llama.sh +0 -0
  94. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/run_qwen.sh +0 -0
  95. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/run_qwen2_vl.sh +0 -0
  96. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/training.py +0 -0
  97. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/huggingface/training_multimodal.py +0 -0
  98. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/lightning/README.md +0 -0
  99. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/lightning/requirements.txt +0 -0
  100. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/lightning/training.py +0 -0
  101. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/README.md +0 -0
  102. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/callback.py +0 -0
  103. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  104. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  105. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  106. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  107. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  108. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  109. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  110. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  111. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  112. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/medusa_util.py +0 -0
  113. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/requirements.txt +0 -0
  114. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  115. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/examples/medusa/train.py +0 -0
  116. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/licenses/LICENSE-Apache-2.0 +0 -0
  117. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  118. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  119. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/licenses/LICENSE-MIT-llmc +0 -0
  120. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/licenses/LICENSE-MIT-triton +0 -0
  121. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/mkdocs.yml +0 -0
  122. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/setup.cfg +0 -0
  123. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/setup.py +0 -0
  124. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/__init__.py +0 -0
  125. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/README.md +0 -0
  126. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  127. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
  128. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  129. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  130. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/functional.py +0 -0
  131. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  132. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  133. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  134. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  135. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  136. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  137. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  138. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/env_report.py +0 -0
  139. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/__init__.py +0 -0
  140. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/cross_entropy.py +0 -0
  141. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/dyt.py +0 -0
  142. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  143. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  144. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
  145. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  146. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  147. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  148. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/geglu.py +0 -0
  149. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/group_norm.py +0 -0
  150. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/grpo_loss.py +0 -0
  151. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/jsd.py +0 -0
  152. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/kl_div.py +0 -0
  153. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/layer_norm.py +0 -0
  154. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/llama4_rope.py +0 -0
  155. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  156. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  157. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/rms_norm.py +0 -0
  158. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/rope.py +0 -0
  159. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/softmax.py +0 -0
  160. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/sparsemax.py +0 -0
  161. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/swiglu.py +0 -0
  162. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/tvd.py +0 -0
  163. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/ops/utils.py +0 -0
  164. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/__init__.py +0 -0
  165. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/auto_model.py +0 -0
  166. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  167. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/dyt.py +0 -0
  168. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
  169. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  170. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/fsdp.py +0 -0
  171. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/functional.py +0 -0
  172. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
  173. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  174. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  175. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  176. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/geglu.py +0 -0
  177. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/group_norm.py +0 -0
  178. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  179. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/jsd.py +0 -0
  180. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/kl_div.py +0 -0
  181. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/layer_norm.py +0 -0
  182. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/llama4_rope.py +0 -0
  183. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/__init__.py +0 -0
  184. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/gemma.py +0 -0
  185. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  186. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/gemma3.py +0 -0
  187. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/glm4.py +0 -0
  188. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/glm4v.py +0 -0
  189. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/llama.py +0 -0
  190. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/llama4.py +0 -0
  191. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/llava.py +0 -0
  192. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  193. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/mistral.py +0 -0
  194. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  195. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/mllama.py +0 -0
  196. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  197. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/paligemma.py +0 -0
  198. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/phi3.py +0 -0
  199. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  200. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  201. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  202. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  203. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  204. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/model/smollm3.py +0 -0
  205. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/monkey_patch.py +0 -0
  206. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  207. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  208. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/rms_norm.py +0 -0
  209. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/rope.py +0 -0
  210. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/softmax.py +0 -0
  211. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/sparsemax.py +0 -0
  212. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/swiglu.py +0 -0
  213. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  214. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  215. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  216. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/transformers/tvd.py +0 -0
  217. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/triton/__init__.py +0 -0
  218. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/triton/monkey_patch.py +0 -0
  219. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel/utils.py +0 -0
  220. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  221. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  222. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  223. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/__init__.py +0 -0
  224. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/__init__.py +0 -0
  225. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/test_cosine_loss.py +0 -0
  226. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/test_cpo_loss.py +0 -0
  227. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/test_dpo_loss.py +0 -0
  228. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/test_jsd_loss.py +0 -0
  229. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/test_kto_loss.py +0 -0
  230. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/test_orpo_loss.py +0 -0
  231. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/chunked_loss/test_simpo_loss.py +0 -0
  232. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/conftest.py +0 -0
  233. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/__init__.py +0 -0
  234. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/bf16/__init__.py +0 -0
  235. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/bf16/test_mini_models.py +0 -0
  236. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
  237. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
  238. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/fp32/__init__.py +0 -0
  239. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/fp32/test_mini_models.py +0 -0
  240. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
  241. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
  242. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  243. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  244. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  245. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  246. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  247. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  248. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  249. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  250. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  251. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  252. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/tiny_shakespeare.txt +0 -0
  253. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  254. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  255. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  256. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_auto_model.py +0 -0
  257. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_cross_entropy.py +0 -0
  258. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_dyt.py +0 -0
  259. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_embedding.py +0 -0
  260. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_flex_attention.py +0 -0
  261. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_fused_add_rms_norm.py +0 -0
  262. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  263. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_fused_linear_jsd.py +0 -0
  264. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  265. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_geglu.py +0 -0
  266. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_group_norm.py +0 -0
  267. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_grpo_loss.py +0 -0
  268. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_jsd.py +0 -0
  269. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_kl_div.py +0 -0
  270. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_layer_norm.py +0 -0
  271. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_mm_int8int2.py +0 -0
  272. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_monkey_patch.py +0 -0
  273. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_multi_token_attention.py +0 -0
  274. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_qwen2vl_mrope.py +0 -0
  275. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_rms_norm.py +0 -0
  276. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_rope.py +0 -0
  277. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_softmax.py +0 -0
  278. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_sparsemax.py +0 -0
  279. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_swiglu.py +0 -0
  280. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_trainer_integration.py +0 -0
  281. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_transformers.py +0 -0
  282. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/transformers/test_tvd.py +0 -0
  283. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/triton/test_triton_monkey_patch.py +0 -0
  284. {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250903164435}/test/utils.py +0 -0
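The substantive changes above are concentrated in the GRPO loss path: a new benchmark script (benchmark/scripts/benchmark_grpo_loss.py), 38 added lines in src/liger_kernel/chunked_loss/grpo_loss.py, 4 in fused_linear_ppo.py, and matching test updates in test/chunked_loss/test_grpo_loss.py. The benchmark rows added further down sweep an importance_sampling_level setting of "token" versus "sequence". As a rough orientation only, the sketch below illustrates the usual meaning of that distinction (per-token ratios versus a single length-normalized ratio per sequence); it is a standalone PyTorch illustration, not the liger_kernel implementation, and the function name, shapes, and mask convention are invented for the example.

# Illustrative sketch only (not the liger_kernel implementation): the usual
# meaning of token- vs sequence-level importance sampling in GRPO-style losses.
import torch

def importance_weights(log_probs, old_log_probs, mask, level="token"):
    # log_probs, old_log_probs: (B, T) per-token log-probabilities
    # mask: (B, T), 1.0 for completion tokens, 0.0 for padding
    log_ratio = (log_probs - old_log_probs) * mask
    if level == "token":
        # one importance ratio per token (padded positions end up as exp(0) = 1)
        return torch.exp(log_ratio)
    if level == "sequence":
        # average the log-ratio over each sequence, then broadcast a single
        # ratio back to every token position
        seq_log_ratio = log_ratio.sum(dim=-1) / mask.sum(dim=-1).clamp(min=1.0)
        return torch.exp(seq_log_ratio).unsqueeze(-1).expand_as(log_ratio)
    raise ValueError(f"unknown importance_sampling_level: {level}")

if __name__ == "__main__":
    torch.manual_seed(0)
    lp, old_lp = torch.randn(2, 8), torch.randn(2, 8)
    mask = torch.ones(2, 8)
    print(importance_weights(lp, old_lp, mask, "token").shape)     # torch.Size([2, 8])
    print(importance_weights(lp, old_lp, mask, "sequence").shape)  # torch.Size([2, 8])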
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: liger_kernel_nightly
- Version: 0.6.2.dev20250903164350
+ Version: 0.6.2.dev20250903164435
  Summary: Efficient Triton kernels for LLM Training
  License: BSD 2-CLAUSE LICENSE
  Copyright 2024 LinkedIn Corporation
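The next hunk appends 64 rows to benchmark/data/all_benchmark_data.csv for the new fused_linear_grpo_loss benchmarks: forward/backward/full speed in ms and peak memory in MB, liger versus torch, token- versus sequence-level importance sampling, batch sizes 2 through 16 on an A100-SXM4-80GB. A minimal pandas sketch for pulling these rows back out of the CSV follows; the column names (kernel_name, kernel_provider, metric_name, x_value, y_value_50) are assumed from this file's usual header, which is not visible in the diff.

# Hedged sketch: filter the newly added GRPO-loss benchmark rows out of the CSV.
# Column names are assumed and may need adjusting to the file's actual header.
import pandas as pd

df = pd.read_csv("benchmark/data/all_benchmark_data.csv")
grpo = df[df["kernel_name"].str.startswith("fused_linear_grpo_loss")]

# Median peak memory (MB) by batch size and provider, token-level variant only.
mem = grpo[(grpo["kernel_name"] == "fused_linear_grpo_loss_token") & (grpo["metric_name"] == "memory")]
print(mem.pivot_table(index="x_value", columns="kernel_provider", values="y_value_50"))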
benchmark/data/all_benchmark_data.csv
@@ -1575,6 +1575,70 @@ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
  fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
+ fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,2,40.75366401672363,40.749671173095706,40.75765686035156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
+ fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,4,80.95231628417969,80.95231628417969,80.95231628417969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
+ fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,8,163.58604431152344,163.58604431152344,163.58604431152344,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
+ fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,16,323.6761474609375,323.6761474609375,323.6761474609375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
+ fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,2,23.71225643157959,23.612825775146483,23.8354434967041,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
+ fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,4,46.86131286621094,46.80355911254883,46.91906661987304,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
+ fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,8,94.54898834228516,94.54898834228516,94.54898834228516,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
+ fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,16,189.99501037597656,189.99501037597656,189.99501037597656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
+ fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,2,42.67263984680176,42.54085083007813,42.80442886352539,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
+ fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,4,82.2446060180664,82.2446060180664,82.2446060180664,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
+ fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,8,167.00416564941406,167.00416564941406,167.00416564941406,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
+ fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,16,327.0911865234375,327.0911865234375,327.0911865234375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
+ fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,2,45.36115264892578,45.241344451904304,45.480960845947266,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
+ fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,4,90.00038146972656,90.00038146972656,90.00038146972656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
+ fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,8,177.22674560546875,177.22674560546875,177.22674560546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
+ fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,16,356.5383605957031,356.5383605957031,356.5383605957031,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
+ fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,2,1.814527988433838,1.8124799728393555,1.8167808055877686,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
+ fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,4,1.84934401512146,1.8472959995269775,1.8524160385131836,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
+ fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8872319459915161,1.893990397453308,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
+ fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,16,1.9722239971160889,1.9660799503326416,1.9763200283050537,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
+ fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,2,22.014975547790527,21.710438537597657,22.19417533874512,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
+ fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,4,41.83603096008301,41.752165222167974,41.91989669799805,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
+ fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,8,81.66400146484375,81.66400146484375,81.66400146484375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
+ fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,16,162.6429443359375,162.6429443359375,162.6429443359375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
+ fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
+ fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
+ fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
+ fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
+ fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
+ fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
+ fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
+ fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
+ fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,2,40.72038269042969,40.71178131103516,40.728984069824214,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
+ fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,4,81.69369506835938,81.69369506835938,81.69369506835938,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
+ fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,8,162.79653930664062,162.79653930664062,162.79653930664062,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
+ fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,16,323.6546630859375,323.6546630859375,323.6546630859375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
+ fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,2,23.70047950744629,23.628594589233398,23.732429122924806,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
+ fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,4,47.36921691894531,47.085364532470706,47.65306930541992,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
+ fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,8,94.83366394042969,94.83366394042969,94.83366394042969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
+ fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,16,190.0963897705078,190.0963897705078,190.0963897705078,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
+ fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,2,42.318336486816406,42.15214080810547,42.48453216552734,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
+ fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,4,82.4616928100586,82.4616928100586,82.4616928100586,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
+ fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,8,163.43756103515625,163.43756103515625,163.43756103515625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
+ fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,16,325.4384765625,325.4384765625,325.4384765625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
+ fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,2,45.99193572998047,45.80761489868165,46.176256561279295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
+ fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,4,88.57190704345703,88.57190704345703,88.57190704345703,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
+ fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,8,176.94105529785156,176.94105529785156,176.94105529785156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
+ fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,16,356.0478820800781,356.0478820800781,356.0478820800781,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
+ fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,2,1.8242560029029846,1.8102271556854248,1.8309119939804077,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
+ fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,4,1.84934401512146,1.846886396408081,1.8534400463104248,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
+ fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8892799615859985,1.8933759927749634,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
+ fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,16,1.9752960205078125,1.9722239971160889,1.977344036102295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
+ fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,2,22.0262393951416,21.80997085571289,22.20482559204102,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
+ fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,4,41.54521560668945,41.224806213378905,41.865625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
+ fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,8,81.21753692626953,81.21753692626953,81.21753692626953,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
+ fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,16,160.82022094726562,160.82022094726562,160.82022094726562,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
+ fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
+ fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
+ fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
+ fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
+ fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
+ fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
+ fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
+ fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
  llama4_rope,liger,forward,speed,ms,H,hidden size,512,0.08249600231647491,0.08102399855852127,0.08432000130414963,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
  llama4_rope,liger,forward,speed,ms,H,hidden size,2048,0.08169600367546082,0.08037760108709335,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
  llama4_rope,liger,forward,speed,ms,H,hidden size,8192,0.08128000050783157,0.07980799674987793,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
@@ -0,0 +1,234 @@
import os
import sys

import torch
import triton

from utils import QUANTILES
from utils import SingleBenchmarkRunInput
from utils import SingleBenchmarkRunOutput
from utils import _test_memory
from utils import parse_benchmark_script_args
from utils import run_benchmarks

from liger_kernel.utils import infer_device

device = infer_device()

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))


#############################################################################
# Test the memory consumption of the fused linear GRPO loss
#############################################################################


def bench_memory_fused_linear_grpo_loss(
    input: SingleBenchmarkRunInput,
) -> SingleBenchmarkRunOutput:
    from test.chunked_loss.test_grpo_loss import LigerLMHeadGRPO
    from test.chunked_loss.test_grpo_loss import TorchLMHeadGRPO

    B = input.x
    T = input.extra_benchmark_config["T"]
    H = input.extra_benchmark_config["H"]
    V = input.extra_benchmark_config["V"]
    dtype = input.extra_benchmark_config["dtype"]
    importance_sampling_level = input.extra_benchmark_config["importance_sampling_level"]
    provider = input.kernel_provider

    # Instantiate once and retrieve the first output only
    torch_lm_head_grpo = TorchLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to(
        device
    )
    liger_lm_head_grpo = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to(
        device
    )

    # Create inputs
    _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
    selected_token_ids = torch.randint(0, V, (B, T), dtype=torch.long, device=device)
    attention_mask = torch.ones(B, T, device=device)
    advantages = torch.randn(B, dtype=dtype, device=device)
    ref_input = torch.randn(B, T, H, dtype=dtype, device=device)

    torch_fwd = lambda: torch_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[
        0
    ]
    liger_fwd = lambda: liger_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[
        0
    ]

    def fwd():
        if provider == "liger":
            return liger_fwd()
        elif provider == "torch":
            return torch_fwd()

    def full():
        y = fwd()
        y.backward()

    mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES)
    return SingleBenchmarkRunOutput(
        y_20=mem_20,
        y_50=mem_50,
        y_80=mem_80,
    )

#############################################################################
# Test the speed of the fused linear GRPO loss
#############################################################################


def bench_speed_fused_linear_grpo_loss(
    input: SingleBenchmarkRunInput,
) -> SingleBenchmarkRunOutput:
    from test.chunked_loss.test_grpo_loss import LigerLMHeadGRPO
    from test.chunked_loss.test_grpo_loss import TorchLMHeadGRPO

    B = input.x
    T = input.extra_benchmark_config["T"]
    H = input.extra_benchmark_config["H"]
    V = input.extra_benchmark_config["V"]
    dtype = input.extra_benchmark_config["dtype"]
    importance_sampling_level = input.extra_benchmark_config["importance_sampling_level"]
    provider = input.kernel_provider
    mode = input.kernel_operation_mode

    # Instantiate once and retrieve the first output only
    torch_lm_head_grpo = TorchLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to(
        device
    )
    liger_lm_head_grpo = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to(
        device
    )

    # Create inputs
    _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
    selected_token_ids = torch.randint(0, V, (B, T), dtype=torch.long, device=device)
    attention_mask = torch.ones(B, T, device=device)
    advantages = torch.randn(B, dtype=dtype, device=device)
    ref_input = torch.randn(B, T, H, dtype=dtype, device=device)

    torch_fwd = lambda: torch_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[
        0
    ]
    liger_fwd = lambda: liger_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[
        0
    ]

    def fwd():
        if provider == "liger":
            return liger_fwd()
        elif provider == "torch":
            return torch_fwd()

    if mode == "forward":
        ms_50, ms_20, ms_80 = triton.testing.do_bench(
            fwd,
            rep=100,
            quantiles=QUANTILES,
        )
    elif mode == "backward":
        y = fwd()

        ms_50, ms_20, ms_80 = triton.testing.do_bench(
            lambda: y.backward(retain_graph=True),
            grad_to_none=[_input],
            rep=100,
            quantiles=QUANTILES,
        )
    elif mode == "full":

        def full():
            y = fwd()
            y.backward()

        ms_50, ms_20, ms_80 = triton.testing.do_bench(
            full,
            rep=100,
            quantiles=QUANTILES,
        )
    return SingleBenchmarkRunOutput(
        y_20=ms_20,
        y_50=ms_50,
        y_80=ms_80,
    )

if __name__ == "__main__":
    args = parse_benchmark_script_args()

    # Benchmark token-level importance sampling (original GRPO)
    token_configs = {
        "kernel_name": "fused_linear_grpo_loss_token",
        "x_name": "B",
        "x_label": "B",
        "x_values": [2**i for i in range(1, 5)],
        "kernel_providers": ["liger", "torch"],
        "extra_benchmark_configs": [
            {
                "T": 1024,
                "H": 4096,
                "V": 128256,
                "importance_sampling_level": "token",
                "dtype": torch.bfloat16,
            }
        ],
        "overwrite": args.overwrite,
    }

    # Benchmark sequence-level importance sampling (GSPO)
    sequence_configs = {
        "kernel_name": "fused_linear_grpo_loss_sequence",
        "x_name": "B",
        "x_label": "B",
        "x_values": [2**i for i in range(1, 5)],
        "kernel_providers": ["liger", "torch"],
        "extra_benchmark_configs": [
            {
                "T": 1024,
                "H": 4096,
                "V": 128256,
                "importance_sampling_level": "sequence",
                "dtype": torch.bfloat16,
            }
        ],
        "overwrite": args.overwrite,
    }

    # Run benchmarks for token-level (GRPO)
    print("Benchmarking GRPO (token-level importance sampling)...")
    run_benchmarks(
        bench_test_fn=bench_speed_fused_linear_grpo_loss,
        kernel_operation_modes=["forward", "full", "backward"],
        metric_name="speed",
        metric_unit="ms",
        **token_configs,
    )
    run_benchmarks(
        bench_test_fn=bench_memory_fused_linear_grpo_loss,
        kernel_operation_modes=["full"],
        metric_name="memory",
        metric_unit="MB",
        **token_configs,
    )

    # Run benchmarks for sequence-level (GSPO)
    print("Benchmarking GSPO (sequence-level importance sampling)...")
    run_benchmarks(
        bench_test_fn=bench_speed_fused_linear_grpo_loss,
        kernel_operation_modes=["forward", "full", "backward"],
        metric_name="speed",
        metric_unit="ms",
        **sequence_configs,
    )
    run_benchmarks(
        bench_test_fn=bench_memory_fused_linear_grpo_loss,
        kernel_operation_modes=["full"],
        metric_name="memory",
        metric_unit="MB",
        **sequence_configs,
    )
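For orientation, a minimal smoke-run sketch of the two heads this benchmark times, using the same constructors and call signature as the script above. It is illustrative only: it assumes the repository root is on sys.path (so test.chunked_loss.test_grpo_loss is importable), an available accelerator, and default values for the constructor arguments not shown here; whether the two losses match closely also depends on the reference-model weights, which this sketch does not tie.

import torch

from liger_kernel.utils import infer_device
from test.chunked_loss.test_grpo_loss import LigerLMHeadGRPO
from test.chunked_loss.test_grpo_loss import TorchLMHeadGRPO

device = infer_device()
B, T, H, V = 2, 128, 4096, 128256

# Same construction as the benchmark, but with a short sequence length to keep it cheap.
torch_head = TorchLMHeadGRPO(H=H, V=V, dtype=torch.bfloat16, importance_sampling_level="sequence").to(device)
liger_head = LigerLMHeadGRPO(H=H, V=V, dtype=torch.bfloat16, importance_sampling_level="sequence").to(device)
liger_head.lin.weight.data.copy_(torch_head.lin.weight.data)  # tie the LM-head weights so both see the same projection

x = torch.randn(B, T, H, requires_grad=True, dtype=torch.bfloat16, device=device)
ids = torch.randint(0, V, (B, T), dtype=torch.long, device=device)
mask = torch.ones(B, T, device=device)
adv = torch.randn(B, dtype=torch.bfloat16, device=device)
ref = torch.randn(B, T, H, dtype=torch.bfloat16, device=device)

# Both heads return (loss, metrics); index 0 is the scalar loss, as in the benchmark lambdas.
print("torch loss:", torch_head(x, ids, mask, adv, ref_input=ref)[0].item())
print("liger loss:", liger_head(x, ids, mask, adv, ref_input=ref)[0].item())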
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "liger_kernel_nightly"
- version = "0.6.2.dev20250903164350"
+ version = "0.6.2.dev20250903164435"
  description = "Efficient Triton kernels for LLM Training"
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -34,6 +34,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  beta=0.04,
  loss_type="bnpo",
  max_completion_length=None,
+ importance_sampling_level="token",
  temperature=1.0,
  compiled=True,
  use_ref_model=False,
@@ -92,6 +93,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  beta=beta,
  loss_type=loss_type,
  max_completion_length=max_completion_length,
+ importance_sampling_level=importance_sampling_level,
  temperature=temperature,
  use_ref_model=use_ref_model,
  ppo_loss_fn=cls.ppo_loss_fn,
@@ -261,6 +263,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  beta=0.04,
  loss_type="bnpo",
  max_completion_length=None,
+ importance_sampling_level="token",
  temperature=1.0,
  use_ref_model=False,
  ppo_loss_fn=None,
@@ -292,6 +295,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  beta=beta,
  loss_type=loss_type,
  max_completion_length=max_completion_length,
+ importance_sampling_level=importance_sampling_level,
  )
 
  return chunk_loss, chunk_metrics
@@ -31,6 +31,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  beta=0.04,
  loss_type="bnpo", # ["grpo", "bnpo", "dr_grpo"]
  max_completion_length=None, # Required for dr_grpo
+ importance_sampling_level="token", # ["token", "sequence"] - new parameter for GSPO
  **kwargs,
  ):
  """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -50,7 +51,22 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 
  # Compute policy gradient loss with importance sampling ratio
  old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach()
- coef_1 = torch.exp(per_token_logps - old_per_token_logps)
+ log_ratio = per_token_logps - old_per_token_logps
+
+ if importance_sampling_level == "token":
+     log_importance_weights = log_ratio
+ elif importance_sampling_level == "sequence":
+     log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
+     log_importance_weights = log_importance_weights.unsqueeze(-1)
+ else:
+     raise ValueError(
+         f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' "
+         "and 'sequence'."
+     )
+
+ # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+ # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+ coef_1 = torch.exp(log_importance_weights)
  coef_2 = clip_coef_fn(coef_1, epsilon_low, epsilon_high)
  per_token_loss1 = coef_1 * advantages.unsqueeze(1)
  per_token_loss2 = coef_2 * advantages.unsqueeze(1)
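For reference, a small self-contained sketch of what the two importance_sampling_level settings above compute; the tensor names and toy shapes are illustrative, not part of the kernel API.

import torch

B, T = 2, 4
per_token_logps = torch.randn(B, T)
old_per_token_logps = torch.randn(B, T)
attention_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 0.0, 0.0]])

log_ratio = per_token_logps - old_per_token_logps

# "token" (GRPO): one importance weight per token, shape (B, T).
token_coef = torch.exp(log_ratio)

# "sequence" (GSPO): the masked mean of log_ratio per sequence, kept as shape (B, 1)
# so it broadcasts against the per-token advantages exactly like the token-level ratio.
seq_log_weight = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
sequence_coef = torch.exp(seq_log_weight.unsqueeze(-1))

print(token_coef.shape, sequence_coef.shape)  # torch.Size([2, 4]) torch.Size([2, 1])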
@@ -85,9 +101,19 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  metrics = []
  if beta != 0.0:
      metrics.append(((kl_div * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)))
- is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
-     (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
- )
+
+ # Adjust clipping metric calculation based on importance sampling level
+ if importance_sampling_level == "token":
+     is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
+         (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
+     )
+ else: # sequence level
+     # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
+     is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (advantages < 0)) | (
+         (coef_1.squeeze(-1) > 1 + epsilon_high) & (advantages > 0)
+     )
+     is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
+
  metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))
  return loss, metrics
 
@@ -111,6 +137,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  epsilon_high=0.2,
  loss_type="bnpo",
  max_completion_length=None,
+ importance_sampling_level="token",
  temperature=1.0,
  compiled=True,
  use_ref_model=True,
@@ -132,6 +159,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  beta (float): Weight for the KL penalty
  loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
  max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+ importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
  temperature (float): Temperature for the logits
  compiled (bool): Whether to use torch compile
  use_ref_model (bool): Whether to use a reference model
@@ -162,6 +190,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  compiled=compiled,
  use_ref_model=use_ref_model,
  chunk_size=chunk_size,
+ importance_sampling_level=importance_sampling_level,
  )
 
  @staticmethod
@@ -187,6 +216,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  None, # grad_epsilon_high
  None, # grad_loss_type (string, not differentiable)
  None, # grad_max_completion_length (int, not differentiable)
+ None, # grad_importance_sampling_level (string, not differentiable)
  None, # grad_temperature
  None, # grad_compiled
  None, # grad_use_ref_model
@@ -207,6 +237,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  epsilon_high: float = 0.2,
  loss_type: str = "bnpo",
  max_completion_length: Optional[int] = None,
+ importance_sampling_level: str = "token",
  temperature: float = 1.0,
  ):
  """
@@ -219,6 +250,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  epsilon_high (float): Upper bound for the importance sampling ratio.
  loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
  max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+ importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
  temperature (float): Temperature for the logits.
  """
  super().__init__()
@@ -230,6 +262,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  self.epsilon_high = epsilon_high
  self.loss_type = loss_type
  self.max_completion_length = max_completion_length
+ self.importance_sampling_level = importance_sampling_level
  self.temperature = temperature
 
  def forward(
@@ -263,6 +296,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  self.epsilon_high,
  self.loss_type,
  self.max_completion_length,
+ self.importance_sampling_level,
  self.temperature,
  self.compiled,
  self.use_ref_model,
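A possible usage sketch of the module-level API with the new flag, using only constructor arguments that appear in this diff and assuming the remaining ones keep their defaults; the forward inputs are unchanged from the existing signature.

from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss

# GSPO-style sequence-level ratios; everything else stays at its default.
gspo_loss = LigerFusedLinearGRPOLoss(loss_type="bnpo", importance_sampling_level="sequence")

# Token-level ratios (the previous behaviour) remain the default.
assert LigerFusedLinearGRPOLoss().importance_sampling_level == "token"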
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: liger_kernel_nightly
- Version: 0.6.2.dev20250903164350
+ Version: 0.6.2.dev20250903164435
  Summary: Efficient Triton kernels for LLM Training
  License: BSD 2-CLAUSE LICENSE
  Copyright 2024 LinkedIn Corporation
@@ -34,6 +34,7 @@ benchmark/scripts/benchmark_fused_linear_jsd.py
  benchmark/scripts/benchmark_fused_neighborhood_attention.py
  benchmark/scripts/benchmark_geglu.py
  benchmark/scripts/benchmark_group_norm.py
+ benchmark/scripts/benchmark_grpo_loss.py
  benchmark/scripts/benchmark_jsd.py
  benchmark/scripts/benchmark_kl_div.py
  benchmark/scripts/benchmark_kto_loss.py
@@ -29,6 +29,7 @@ class TorchLMHeadGRPO(torch.nn.Module):
  use_ref_model: bool = True,
  loss_type: str = "bnpo",
  max_completion_length: int | None = None,
+ importance_sampling_level: str = "token",
  ):
  super().__init__()
  self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
@@ -40,6 +41,7 @@ class TorchLMHeadGRPO(torch.nn.Module):
  self.use_ref_model = use_ref_model
  self.loss_type = loss_type
  self.max_completion_length = max_completion_length
+ self.importance_sampling_level = importance_sampling_level
  if self.loss_type == "dr_grpo":
      assert self.max_completion_length is not None, "max_completion_length must be provided for dr_grpo"
 
@@ -84,7 +86,20 @@ class TorchLMHeadGRPO(torch.nn.Module):
  old_per_token_logps = (
      old_per_token_logps.float() if old_per_token_logps is not None else per_token_logps.detach()
  )
- coef_1 = torch.exp(per_token_logps - old_per_token_logps)
+ log_ratio = per_token_logps - old_per_token_logps
+
+ if self.importance_sampling_level == "token":
+     log_importance_weights = log_ratio
+ elif self.importance_sampling_level == "sequence":
+     log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
+     log_importance_weights = log_importance_weights.unsqueeze(-1)
+ else:
+     raise ValueError(
+         f"Unknown importance sampling level: {self.importance_sampling_level}. Possible values are 'token' "
+         "and 'sequence'."
+     )
+
+ coef_1 = torch.exp(log_importance_weights)
  coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
  per_token_loss1 = coef_1 * advantages.unsqueeze(1)
  per_token_loss2 = coef_2 * advantages.unsqueeze(1)
@@ -109,9 +124,17 @@ class TorchLMHeadGRPO(torch.nn.Module):
  metrics = []
  if self.beta != 0.0:
      metrics.append(((kl_div * attention_mask).sum() / torch.clamp(attention_mask.sum(), min=1.0)))
- is_clipped = ((coef_1 < 1 - self.epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
-     (coef_1 > 1 + self.epsilon_high) & (advantages.unsqueeze(1) > 0)
- )
+ # Adjust clipping metric calculation based on importance sampling level
+ if self.importance_sampling_level == "token":
+     is_clipped = ((coef_1 < 1 - self.epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
+         (coef_1 > 1 + self.epsilon_high) & (advantages.unsqueeze(1) > 0)
+     )
+ else: # sequence level
+     # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
+     is_clipped = ((coef_1.squeeze(-1) < 1 - self.epsilon_low) & (advantages < 0)) | (
+         (coef_1.squeeze(-1) > 1 + self.epsilon_high) & (advantages > 0)
+     )
+     is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
  metrics.append((is_clipped * attention_mask).sum() / torch.clamp(attention_mask.sum(), min=1.0))
  return loss, metrics
 
@@ -130,6 +153,7 @@ class LigerLMHeadGRPO(torch.nn.Module):
  use_ref_model: bool = True,
  loss_type: str = "bnpo",
  max_completion_length: int | None = None,
+ importance_sampling_level: str = "token",
  ):
  super().__init__()
  self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
@@ -143,6 +167,7 @@ class LigerLMHeadGRPO(torch.nn.Module):
  compiled=True,
  loss_type=loss_type,
  max_completion_length=max_completion_length,
+ importance_sampling_level=importance_sampling_level,
  )
 
  def forward(
@@ -203,6 +228,7 @@ class LigerLMHeadGRPO(torch.nn.Module):
  ],
  )
  @pytest.mark.parametrize("loss_type", ["bnpo", "grpo", "dr_grpo"])
+ @pytest.mark.parametrize("importance_sampling_level", ["token", "sequence"])
  def test_correctness(
  B,
  T,
@@ -221,6 +247,7 @@ def test_correctness(
  use_ref_model,
  old_per_token_logps,
  loss_type,
+ importance_sampling_level,
  ):
  # Reset torch compiler cache for each parameter of the test case
  torch.compiler.reset()
@@ -238,6 +265,7 @@ def test_correctness(
  use_ref_model=use_ref_model,
  loss_type=loss_type,
  max_completion_length=max_completion_length,
+ importance_sampling_level=importance_sampling_level,
  )
  liger_lm_head_grpo = LigerLMHeadGRPO(
  H=H,
@@ -251,6 +279,7 @@ def test_correctness(
  use_ref_model=use_ref_model,
  loss_type=loss_type,
  max_completion_length=max_completion_length,
+ importance_sampling_level=importance_sampling_level,
  )
 
  # Initialize weights
@@ -445,6 +474,7 @@ def test_functional_correctness(
  0.2,
  "bnpo",
  max_completion_length,
+ "token",
  1.0,
  False,
  True,
@@ -468,6 +498,7 @@ def test_functional_correctness(
  0.2,
  "bnpo",
  max_completion_length,
+ "token",
  1.0,
  False,
  True,
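To exercise the new parametrization locally, one option is to filter on the parametrize id; the -k expression below is an assumption about how pytest renders the new importance_sampling_level cases, not something fixed by this diff.

import pytest

# Run only the GRPO correctness tests whose id mentions the sequence-level cases.
pytest.main(["test/chunked_loss/test_grpo_loss.py", "-k", "sequence"])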