liger-kernel-nightly 0.6.1.dev20250819173444__tar.gz → 0.6.2.dev20250822031319__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (283)
  1. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/PKG-INFO +1 -1
  2. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/pyproject.toml +1 -1
  3. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/fused_linear_cross_entropy.py +41 -1
  4. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/functional.py +2 -0
  5. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +3 -0
  6. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
  7. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_fused_linear_cross_entropy.py +226 -0
  8. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  9. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  10. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/pull_request_template.md +0 -0
  11. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/workflows/amd-ci.yml +0 -0
  12. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/workflows/benchmark.yml +0 -0
  13. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/workflows/docs.yml +0 -0
  14. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/workflows/intel-ci.yml +0 -0
  15. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/workflows/nvi-ci.yml +0 -0
  16. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/workflows/publish-nightly.yml +0 -0
  17. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.github/workflows/publish-release.yml +0 -0
  18. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/.gitignore +0 -0
  19. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/LICENSE +0 -0
  20. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/Makefile +0 -0
  21. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/NOTICE +0 -0
  22. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/README.md +0 -0
  23. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/README.md +0 -0
  24. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/__init__.py +0 -0
  25. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/benchmarks_visualizer.py +0 -0
  26. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/data/all_benchmark_data.csv +0 -0
  27. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/__init__.py +0 -0
  28. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  29. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  30. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  31. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  32. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  33. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_dyt.py +0 -0
  34. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_embedding.py +0 -0
  35. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
  36. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  37. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  38. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  39. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_geglu.py +0 -0
  40. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_group_norm.py +0 -0
  41. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_jsd.py +0 -0
  42. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_kl_div.py +0 -0
  43. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  44. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  45. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
  46. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  47. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  48. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  49. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  50. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_rope.py +0 -0
  51. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  52. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_softmax.py +0 -0
  53. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  54. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  55. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_swiglu.py +0 -0
  56. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/benchmark_tvd.py +0 -0
  57. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/benchmark/scripts/utils.py +0 -0
  58. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/dev/fmt-requirements.txt +0 -0
  59. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/dev/modal/benchmarks.py +0 -0
  60. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/dev/modal/tests.py +0 -0
  61. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/dev/modal/tests_bwd.py +0 -0
  62. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/Examples.md +0 -0
  63. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/Getting-Started.md +0 -0
  64. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/High-Level-APIs.md +0 -0
  65. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/Low-Level-APIs.md +0 -0
  66. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/acknowledgement.md +0 -0
  67. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/contributing.md +0 -0
  68. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/images/banner.GIF +0 -0
  69. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/images/compose.gif +0 -0
  70. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/images/e2e-memory.png +0 -0
  71. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/images/e2e-tps.png +0 -0
  72. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/images/logo-banner.png +0 -0
  73. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/images/patch.gif +0 -0
  74. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/images/post-training.png +0 -0
  75. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/index.md +0 -0
  76. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/docs/license.md +0 -0
  77. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/alignment/accelerate_config.yaml +0 -0
  78. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/alignment/run_orpo.py +0 -0
  79. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/README.md +0 -0
  80. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/callback.py +0 -0
  81. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/config/fsdp_config.json +0 -0
  82. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  83. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  84. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  85. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/img/llama_tps.png +0 -0
  86. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  87. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/img/qwen_tps.png +0 -0
  88. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/launch_on_modal.py +0 -0
  89. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/requirements.txt +0 -0
  90. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/run_benchmarks.sh +0 -0
  91. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/run_gemma.sh +0 -0
  92. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/run_llama.sh +0 -0
  93. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/run_qwen.sh +0 -0
  94. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/run_qwen2_vl.sh +0 -0
  95. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/training.py +0 -0
  96. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/huggingface/training_multimodal.py +0 -0
  97. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/lightning/README.md +0 -0
  98. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/lightning/requirements.txt +0 -0
  99. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/lightning/training.py +0 -0
  100. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/README.md +0 -0
  101. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/callback.py +0 -0
  102. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  103. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  104. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  105. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  106. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  107. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  108. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  109. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  110. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  111. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/medusa_util.py +0 -0
  112. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/requirements.txt +0 -0
  113. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  114. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/examples/medusa/train.py +0 -0
  115. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/licenses/LICENSE-Apache-2.0 +0 -0
  116. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  117. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  118. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/licenses/LICENSE-MIT-llmc +0 -0
  119. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/licenses/LICENSE-MIT-triton +0 -0
  120. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/mkdocs.yml +0 -0
  121. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/setup.cfg +0 -0
  122. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/setup.py +0 -0
  123. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/__init__.py +0 -0
  124. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/README.md +0 -0
  125. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  126. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
  127. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  128. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  129. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/functional.py +0 -0
  130. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  131. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  132. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  133. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  134. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  135. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  136. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  137. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  138. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  139. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/env_report.py +0 -0
  140. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/__init__.py +0 -0
  141. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/cross_entropy.py +0 -0
  142. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/dyt.py +0 -0
  143. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  144. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  145. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
  146. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  147. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  148. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/geglu.py +0 -0
  149. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/group_norm.py +0 -0
  150. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/grpo_loss.py +0 -0
  151. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/jsd.py +0 -0
  152. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/kl_div.py +0 -0
  153. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/layer_norm.py +0 -0
  154. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/llama4_rope.py +0 -0
  155. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  156. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  157. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/rms_norm.py +0 -0
  158. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/rope.py +0 -0
  159. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/softmax.py +0 -0
  160. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/sparsemax.py +0 -0
  161. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/swiglu.py +0 -0
  162. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/tvd.py +0 -0
  163. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/ops/utils.py +0 -0
  164. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/__init__.py +0 -0
  165. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/auto_model.py +0 -0
  166. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  167. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/dyt.py +0 -0
  168. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
  169. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  170. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/fsdp.py +0 -0
  171. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
  172. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  173. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  174. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/geglu.py +0 -0
  175. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/group_norm.py +0 -0
  176. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  177. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/jsd.py +0 -0
  178. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/kl_div.py +0 -0
  179. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/layer_norm.py +0 -0
  180. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/llama4_rope.py +0 -0
  181. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/__init__.py +0 -0
  182. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/gemma.py +0 -0
  183. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  184. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/gemma3.py +0 -0
  185. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/glm4.py +0 -0
  186. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/glm4v.py +0 -0
  187. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/llama.py +0 -0
  188. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/llama4.py +0 -0
  189. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/llava.py +0 -0
  190. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  191. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/mistral.py +0 -0
  192. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  193. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/mllama.py +0 -0
  194. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  195. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/paligemma.py +0 -0
  196. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/phi3.py +0 -0
  197. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  198. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  199. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  200. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  201. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  202. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/model/smollm3.py +0 -0
  203. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/monkey_patch.py +0 -0
  204. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  205. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  206. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/rms_norm.py +0 -0
  207. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/rope.py +0 -0
  208. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/softmax.py +0 -0
  209. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/sparsemax.py +0 -0
  210. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/swiglu.py +0 -0
  211. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  212. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  213. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  214. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/transformers/tvd.py +0 -0
  215. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/triton/__init__.py +0 -0
  216. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/triton/monkey_patch.py +0 -0
  217. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel/utils.py +0 -0
  218. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
  219. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  220. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  221. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  222. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/__init__.py +0 -0
  223. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/__init__.py +0 -0
  224. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/test_cosine_loss.py +0 -0
  225. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/test_cpo_loss.py +0 -0
  226. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/test_dpo_loss.py +0 -0
  227. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/test_grpo_loss.py +0 -0
  228. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/test_jsd_loss.py +0 -0
  229. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/test_kto_loss.py +0 -0
  230. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/test_orpo_loss.py +0 -0
  231. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/chunked_loss/test_simpo_loss.py +0 -0
  232. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/conftest.py +0 -0
  233. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/__init__.py +0 -0
  234. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/bf16/__init__.py +0 -0
  235. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/bf16/test_mini_models.py +0 -0
  236. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
  237. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
  238. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/fp32/__init__.py +0 -0
  239. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/fp32/test_mini_models.py +0 -0
  240. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
  241. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
  242. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  243. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  244. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  245. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  246. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  247. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  248. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  249. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  250. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  251. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  252. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/tiny_shakespeare.txt +0 -0
  253. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  254. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  255. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  256. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_auto_model.py +0 -0
  257. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_cross_entropy.py +0 -0
  258. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_dyt.py +0 -0
  259. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_embedding.py +0 -0
  260. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_flex_attention.py +0 -0
  261. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_fused_add_rms_norm.py +0 -0
  262. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_fused_linear_jsd.py +0 -0
  263. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  264. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_geglu.py +0 -0
  265. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_group_norm.py +0 -0
  266. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_grpo_loss.py +0 -0
  267. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_jsd.py +0 -0
  268. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_kl_div.py +0 -0
  269. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_layer_norm.py +0 -0
  270. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_mm_int8int2.py +0 -0
  271. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_monkey_patch.py +0 -0
  272. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_multi_token_attention.py +0 -0
  273. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_qwen2vl_mrope.py +0 -0
  274. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_rms_norm.py +0 -0
  275. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_rope.py +0 -0
  276. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_softmax.py +0 -0
  277. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_sparsemax.py +0 -0
  278. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_swiglu.py +0 -0
  279. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_trainer_integration.py +0 -0
  280. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_transformers.py +0 -0
  281. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/transformers/test_tvd.py +0 -0
  282. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/triton/test_triton_monkey_patch.py +0 -0
  283. {liger_kernel_nightly-0.6.1.dev20250819173444 → liger_kernel_nightly-0.6.2.dev20250822031319}/test/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.1.dev20250819173444
3
+ Version: 0.6.2.dev20250822031319
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "liger_kernel_nightly"
7
- version = "0.6.1.dev20250819173444"
7
+ version = "0.6.2.dev20250822031319"
8
8
  description = "Efficient Triton kernels for LLM Training"
9
9
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -26,6 +26,7 @@ def fused_linear_cross_entropy_forward(
26
26
  softcap=None,
27
27
  return_z_loss=False,
28
28
  accum_dtype=None,
29
+ use_token_scaling=False,
29
30
  ):
30
31
  assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
31
32
  device = _input.device
@@ -89,6 +90,23 @@ def fused_linear_cross_entropy_forward(
89
90
 
90
91
  n_rows = logits_chunk.shape[0]
91
92
 
93
+ # Compute predicted probabilities for token scaling if needed
94
+ if use_token_scaling:
95
+ # Compute softmax probabilities for scaling
96
+ # We need to compute this before the cross entropy kernel modifies logits_chunk
97
+ logits_for_softmax = logits_chunk.detach().clone() # Detach to avoid gradient flow
98
+ if softcap is not None:
99
+ logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
100
+
101
+ # Compute softmax to get predicted probabilities
102
+ probs = torch.softmax(logits_for_softmax, dim=-1)
103
+
104
+ # Get the predicted probability for each target token
105
+ pred_probs = torch.gather(probs, -1, target_chunk.unsqueeze(-1)).squeeze(-1)
106
+
107
+ # Store the scaling factors
108
+ scaling_factors = pred_probs.detach() # Detach to ensure no gradient flow
109
+
92
110
  # unreduced loss
93
111
  loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size,
94
112
  z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
@@ -123,11 +141,23 @@ def fused_linear_cross_entropy_forward(
123
141
  num_warps=32 if not is_hip() else 16,
124
142
  )
125
143
 
144
+ # Apply token scaling if requested
145
+ if use_token_scaling:
146
+ loss_1d_slice = loss_1d_slice * scaling_factors
147
+ if return_z_loss:
148
+ z_loss_1d_slice = z_loss_1d_slice * scaling_factors
149
+
126
150
  loss_1d[start_idx:end_idx] = loss_1d_slice
127
151
  if return_z_loss:
128
152
  z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
129
153
  grad_logits_chunk = logits_chunk # chunk_size x V
130
154
 
155
+ # Apply token scaling to gradients if requested
156
+ if use_token_scaling:
157
+ # Expand scaling factors to match gradient dimensions
158
+ scaling_factors_expanded = scaling_factors.unsqueeze(-1) # chunk_size x 1
159
+ grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
160
+
131
161
  grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
132
162
 
133
163
  if grad_weight is not None:
@@ -136,7 +166,7 @@ def fused_linear_cross_entropy_forward(
136
166
  if bias is not None:
137
167
  torch.add(
138
168
  input=grad_bias,
139
- other=logits_chunk.sum(dim=0),
169
+ other=grad_logits_chunk.sum(dim=0),
140
170
  out=grad_bias,
141
171
  alpha=1.0,
142
172
  )
@@ -146,6 +176,10 @@ def fused_linear_cross_entropy_forward(
146
176
  # loss = loss_1d
147
177
  # z_loss = z_loss_1d if return_z_loss else None
148
178
 
179
+ if reduction == "none":
180
+ # Return per-token losses
181
+ loss = loss_1d
182
+ z_loss = z_loss_1d if return_z_loss else None
149
183
  else:
150
184
  loss = torch.sum(loss_1d)
151
185
  z_loss = torch.sum(z_loss_1d) if return_z_loss else None
@@ -221,6 +255,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
221
255
  softcap=None,
222
256
  return_z_loss: bool = False,
223
257
  accum_dtype=None,
258
+ use_token_scaling: bool = False,
224
259
  ):
225
260
  """
226
261
  Fusing the last linear layer with cross-entropy loss
@@ -241,6 +276,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
241
276
  reduction: reduction to apply
242
277
  accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
243
278
  Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
279
+ use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
280
+ When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
281
+ Default: False.
244
282
  """
245
283
 
246
284
  loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
@@ -256,6 +294,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
256
294
  softcap=softcap,
257
295
  return_z_loss=return_z_loss,
258
296
  accum_dtype=accum_dtype,
297
+ use_token_scaling=use_token_scaling,
259
298
  )
260
299
  # downcast to dtype and store for backward
261
300
  ctx.save_for_backward(
@@ -288,4 +327,5 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
288
327
  None,
289
328
  None,
290
329
  None,
330
+ None, # use_token_scaling
291
331
  )
@@ -65,6 +65,7 @@ def liger_fused_linear_cross_entropy(
65
65
  softcap: Optional[float] = None,
66
66
  return_z_loss: bool = False,
67
67
  accum_dtype=None,
68
+ use_token_scaling: bool = False,
68
69
  ):
69
70
  loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
70
71
  input,
@@ -79,6 +80,7 @@ def liger_fused_linear_cross_entropy(
79
80
  softcap,
80
81
  return_z_loss,
81
82
  accum_dtype,
83
+ use_token_scaling,
82
84
  )
83
85
  if not return_z_loss:
84
86
  return loss
@@ -16,6 +16,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
16
16
  softcap: Optional[float] = None,
17
17
  return_z_loss: bool = False,
18
18
  accum_dtype: Optional[torch.dtype] = None,
19
+ use_token_scaling: bool = False,
19
20
  ):
20
21
  super().__init__()
21
22
  assert (label_smoothing >= 0) and (label_smoothing <= 1), (
@@ -34,6 +35,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
34
35
  self.softcap = softcap
35
36
  self.return_z_loss = return_z_loss
36
37
  self.accum_dtype = accum_dtype
38
+ self.use_token_scaling = use_token_scaling
37
39
 
38
40
  def forward(self, lin_weight, _input, target, bias=None):
39
41
  loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
@@ -49,6 +51,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
49
51
  self.softcap,
50
52
  self.return_z_loss,
51
53
  self.accum_dtype,
54
+ self.use_token_scaling,
52
55
  )
53
56
  if not self.return_z_loss:
54
57
  return loss
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.1.dev20250819173444
3
+ Version: 0.6.2.dev20250822031319
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -352,3 +352,229 @@ def test_amp(B, T, H, V, bias, cast_dtype, accum_dtype, atol, rtol):
352
352
  atol=atol,
353
353
  rtol=rtol,
354
354
  )
355
+
356
+
357
+ def test_correctness_token_scaling():
358
+ """Test that token scaling produces the correct loss values and gradients."""
359
+ B, T, H, V = 2, 4, 8, 16
360
+ dtype = torch.float32
361
+
362
+ # Create inputs
363
+ _input = torch.randn(B * T, H, device=device, dtype=dtype, requires_grad=True)
364
+ target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
365
+
366
+ # Create weights
367
+ weight = torch.randn(V, H, device=device, dtype=dtype)
368
+ bias = torch.randn(V, device=device, dtype=dtype)
369
+
370
+ # Test using functional API with token scaling
371
+ loss_scaled = liger_fused_linear_cross_entropy(
372
+ input=_input,
373
+ weight=weight,
374
+ target=target,
375
+ bias=bias,
376
+ ignore_index=-100,
377
+ reduction="none", # Use "none" to get per-token losses
378
+ use_token_scaling=True,
379
+ )
380
+
381
+ # Compare with manual implementation
382
+ # Compute logits
383
+ logits = _input @ weight.t()
384
+ if bias is not None:
385
+ logits = logits + bias
386
+
387
+ # Compute standard cross entropy loss per token
388
+ ce_loss = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
389
+
390
+ # Compute predicted probabilities for target tokens
391
+ pred_probs = torch.softmax(logits, dim=-1).gather(1, target.unsqueeze(-1)).squeeze(-1).detach()
392
+
393
+ # Scale by predicted probabilities
394
+ expected_loss = ce_loss * pred_probs
395
+
396
+ # Check that losses are close
397
+ assert torch.allclose(loss_scaled, expected_loss, atol=1e-4, rtol=1e-4)
398
+
399
+ # Test gradients
400
+ loss_scaled.sum().backward(retain_graph=True)
401
+ grad_scaled = _input.grad.clone()
402
+ _input.grad.zero_()
403
+
404
+ expected_loss.sum().backward(retain_graph=True)
405
+ grad_expected = _input.grad.clone()
406
+ _input.grad.zero_()
407
+
408
+ # Check that gradients are close
409
+ assert torch.allclose(grad_scaled, grad_expected, atol=1e-4, rtol=1e-4)
410
+
411
+
412
+ def test_correctness_token_scaling_consistency():
413
+ """Test that token scaling is consistent between functional and module APIs."""
414
+ B, T, H, V = 2, 4, 8, 16
415
+ dtype = torch.float32
416
+
417
+ # Create inputs
418
+ _input = torch.randn(B * T, H, device=device, dtype=dtype, requires_grad=True)
419
+ target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
420
+
421
+ # Create weights
422
+ weight = torch.randn(V, H, device=device, dtype=dtype)
423
+ bias = torch.randn(V, device=device, dtype=dtype)
424
+
425
+ # Test functional API
426
+ loss_functional = liger_fused_linear_cross_entropy(
427
+ input=_input,
428
+ weight=weight,
429
+ target=target,
430
+ bias=bias,
431
+ ignore_index=-100,
432
+ reduction="sum",
433
+ use_token_scaling=True,
434
+ )
435
+
436
+ # Test module API
437
+ ce_loss_module = LigerFusedLinearCrossEntropyLoss(
438
+ ignore_index=-100,
439
+ reduction="sum",
440
+ use_token_scaling=True,
441
+ )
442
+
443
+ loss_module = ce_loss_module(weight, _input, target, bias)
444
+
445
+ # Check that losses are identical
446
+ assert torch.allclose(loss_functional, loss_module, atol=1e-6, rtol=1e-6)
447
+
448
+ # Test gradients
449
+ loss_functional.backward(retain_graph=True)
450
+ grad_functional = _input.grad.clone()
451
+ _input.grad.zero_()
452
+
453
+ loss_module.backward(retain_graph=True)
454
+ grad_module = _input.grad.clone()
455
+ _input.grad.zero_()
456
+
457
+ # Check that gradients are identical
458
+ assert torch.allclose(grad_functional, grad_module, atol=1e-6, rtol=1e-6)
459
+
460
+
461
+ def test_correctness_token_scaling_functional():
462
+ """Test token scaling using the functional API."""
463
+ B, T, H, V = 2, 4, 8, 16
464
+ dtype = torch.float32
465
+
466
+ # Create inputs
467
+ _input = torch.randn(B * T, H, device=device, dtype=dtype)
468
+ x1 = _input.detach().clone().requires_grad_(True)
469
+ x2 = _input.detach().clone().requires_grad_(True)
470
+
471
+ target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
472
+
473
+ # Create weights
474
+ weight = torch.randn(V, H, device=device, dtype=dtype)
475
+ bias = torch.randn(V, device=device, dtype=dtype)
476
+
477
+ # Test using functional API with token scaling
478
+ y1 = liger_fused_linear_cross_entropy(
479
+ input=x1,
480
+ weight=weight,
481
+ target=target,
482
+ bias=bias,
483
+ ignore_index=-100,
484
+ lse_square_scale=0.0,
485
+ label_smoothing=0.0,
486
+ reduction="sum", # Use sum for easier verification
487
+ softcap=None,
488
+ return_z_loss=False,
489
+ accum_dtype=None,
490
+ use_token_scaling=True,
491
+ )
492
+
493
+ # Compare with manual implementation
494
+ # Compute logits
495
+ logits = x2 @ weight.t()
496
+ if bias is not None:
497
+ logits = logits + bias
498
+
499
+ # Compute softmax probabilities
500
+ probs = torch.softmax(logits.detach(), dim=-1) # Detach to avoid gradient flow
501
+
502
+ # Get predicted probabilities for target tokens
503
+ pred_probs = torch.gather(probs, -1, target.unsqueeze(-1)).squeeze(-1)
504
+
505
+ # Compute standard cross entropy loss
506
+ ce_loss = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
507
+
508
+ # Scale by predicted probabilities
509
+ scaled_loss = ce_loss * pred_probs
510
+
511
+ # Sum over all tokens
512
+ y2 = scaled_loss.sum()
513
+
514
+ # Check that losses are close
515
+ assert torch.allclose(y1, y2, atol=1e-5, rtol=1e-5)
516
+
517
+ # Test gradients
518
+ y1.backward()
519
+ y2.backward()
520
+
521
+ # Check that gradients are close
522
+ assert torch.allclose(x1.grad, x2.grad, atol=1e-5, rtol=1e-5)
523
+
524
+
525
+ def test_correctness_token_scaling_module():
526
+ """Test token scaling using the module API."""
527
+ B, T, H, V = 2, 4, 8, 16
528
+ dtype = torch.float32
529
+
530
+ # Create inputs
531
+ _input = torch.randn(B * T, H, device=device, dtype=dtype)
532
+ x1 = _input.detach().clone().requires_grad_(True)
533
+ x2 = _input.detach().clone().requires_grad_(True)
534
+
535
+ target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
536
+
537
+ # Create module with token scaling
538
+ ce_loss = LigerFusedLinearCrossEntropyLoss(
539
+ ignore_index=-100,
540
+ reduction="sum",
541
+ use_token_scaling=True,
542
+ )
543
+
544
+ # Create weights
545
+ weight = torch.randn(V, H, device=device, dtype=dtype)
546
+ bias = torch.randn(V, device=device, dtype=dtype)
547
+
548
+ # Test using module API with token scaling
549
+ y1 = ce_loss(weight, x1, target, bias)
550
+
551
+ # Compare with manual implementation
552
+ # Compute logits
553
+ logits = x2 @ weight.t()
554
+ if bias is not None:
555
+ logits = logits + bias
556
+
557
+ # Compute softmax probabilities
558
+ probs = torch.softmax(logits.detach(), dim=-1) # Detach to avoid gradient flow
559
+
560
+ # Get predicted probabilities for target tokens
561
+ pred_probs = torch.gather(probs, -1, target.unsqueeze(-1)).squeeze(-1)
562
+
563
+ # Compute standard cross entropy loss
564
+ ce_loss_manual = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
565
+
566
+ # Scale by predicted probabilities
567
+ scaled_loss = ce_loss_manual * pred_probs
568
+
569
+ # Sum over all tokens
570
+ y2 = scaled_loss.sum()
571
+
572
+ # Check that losses are close
573
+ assert torch.allclose(y1, y2, atol=1e-5, rtol=1e-5)
574
+
575
+ # Test gradients
576
+ y1.backward()
577
+ y2.backward()
578
+
579
+ # Check that gradients are close
580
+ assert torch.allclose(x1.grad, x2.grad, atol=1e-5, rtol=1e-5)