liger-kernel-nightly 0.5.10.dev20250522174514__tar.gz → 0.5.10.dev20250523162037__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255)
  1. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.gitignore +3 -0
  2. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/Makefile +8 -2
  3. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/PKG-INFO +1 -1
  4. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/mkdocs.yml +2 -1
  5. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/pyproject.toml +1 -1
  6. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/gemma.py +11 -3
  7. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/gemma2.py +11 -3
  8. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/gemma3.py +14 -2
  9. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/glm4.py +11 -3
  10. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/llama.py +10 -2
  11. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/llava.py +5 -1
  12. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/mistral.py +8 -1
  13. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/mixtral.py +11 -3
  14. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/mllama.py +11 -3
  15. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/olmo2.py +11 -3
  16. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/paligemma.py +8 -1
  17. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/phi3.py +11 -3
  18. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen2.py +11 -3
  19. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen2_5_vl.py +8 -1
  20. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen2_vl.py +8 -1
  21. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen3.py +11 -3
  22. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen3_moe.py +5 -2
  23. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
  24. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/bf16/test_mini_models.py +31 -18
  25. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/fp32/test_mini_models.py +29 -16
  26. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  27. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  28. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/pull_request_template.md +0 -0
  29. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/amd-ci.yml +0 -0
  30. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/docs.yml +0 -0
  31. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/intel-ci.yml +0 -0
  32. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/nvi-ci.yml +0 -0
  33. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/publish-nightly.yml +0 -0
  34. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/publish-release.yml +0 -0
  35. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.idea/workspace.xml +0 -0
  36. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/LICENSE +0 -0
  37. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/NOTICE +0 -0
  38. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/README.md +0 -0
  39. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/README.md +0 -0
  40. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/__init__.py +0 -0
  41. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/benchmarks_visualizer.py +0 -0
  42. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/data/all_benchmark_data.csv +0 -0
  43. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/__init__.py +0 -0
  44. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  45. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  46. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  47. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  48. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_dyt.py +0 -0
  49. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_embedding.py +0 -0
  50. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  51. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  52. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_geglu.py +0 -0
  53. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_group_norm.py +0 -0
  54. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_jsd.py +0 -0
  55. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_kl_div.py +0 -0
  56. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  57. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  58. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  59. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  60. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  61. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_rope.py +0 -0
  62. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  63. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  64. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_swiglu.py +0 -0
  65. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_tvd.py +0 -0
  66. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/utils.py +0 -0
  67. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/dev/fmt-requirements.txt +0 -0
  68. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/dev/modal/tests.py +0 -0
  69. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/dev/modal/tests_bwd.py +0 -0
  70. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/Examples.md +0 -0
  71. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/Getting-Started.md +0 -0
  72. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/High-Level-APIs.md +0 -0
  73. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/Low-Level-APIs.md +0 -0
  74. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/acknowledgement.md +0 -0
  75. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/contributing.md +0 -0
  76. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/banner.GIF +0 -0
  77. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/compose.gif +0 -0
  78. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/e2e-memory.png +0 -0
  79. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/e2e-tps.png +0 -0
  80. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/logo-banner.png +0 -0
  81. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/patch.gif +0 -0
  82. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/post-training.png +0 -0
  83. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/index.md +0 -0
  84. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/license.md +0 -0
  85. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/alignment/accelerate_config.yaml +0 -0
  86. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/alignment/run_orpo.py +0 -0
  87. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/README.md +0 -0
  88. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/callback.py +0 -0
  89. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/config/fsdp_config.json +0 -0
  90. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  91. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  92. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  93. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/llama_tps.png +0 -0
  94. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  95. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/qwen_tps.png +0 -0
  96. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/launch_on_modal.py +0 -0
  97. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/requirements.txt +0 -0
  98. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_benchmarks.sh +0 -0
  99. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_gemma.sh +0 -0
  100. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_llama.sh +0 -0
  101. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_qwen.sh +0 -0
  102. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_qwen2_vl.sh +0 -0
  103. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/training.py +0 -0
  104. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/training_multimodal.py +0 -0
  105. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/lightning/README.md +0 -0
  106. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/lightning/requirements.txt +0 -0
  107. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/lightning/training.py +0 -0
  108. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/README.md +0 -0
  109. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/callback.py +0 -0
  110. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  111. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  112. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  113. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  114. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  115. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  116. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  117. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  118. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  119. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/medusa_util.py +0 -0
  120. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/requirements.txt +0 -0
  121. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  122. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/train.py +0 -0
  123. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-Apache-2.0 +0 -0
  124. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  125. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  126. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-MIT-llmc +0 -0
  127. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-MIT-triton +0 -0
  128. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/setup.cfg +0 -0
  129. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/setup.py +0 -0
  130. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/__init__.py +0 -0
  131. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/README.md +0 -0
  132. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  133. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  134. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  135. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/functional.py +0 -0
  136. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  137. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  138. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  139. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  140. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  141. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  142. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  143. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  144. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  145. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/env_report.py +0 -0
  146. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/__init__.py +0 -0
  147. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/cross_entropy.py +0 -0
  148. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/dyt.py +0 -0
  149. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  150. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  151. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  152. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  153. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/geglu.py +0 -0
  154. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/group_norm.py +0 -0
  155. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/grpo_loss.py +0 -0
  156. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/jsd.py +0 -0
  157. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/kl_div.py +0 -0
  158. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/layer_norm.py +0 -0
  159. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  160. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/rms_norm.py +0 -0
  161. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/rope.py +0 -0
  162. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/sparsemax.py +0 -0
  163. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/swiglu.py +0 -0
  164. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/tvd.py +0 -0
  165. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/utils.py +0 -0
  166. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/__init__.py +0 -0
  167. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/auto_model.py +0 -0
  168. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  169. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/dyt.py +0 -0
  170. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  171. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/fsdp.py +0 -0
  172. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/functional.py +0 -0
  173. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  174. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  175. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/geglu.py +0 -0
  176. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/gema3_rms.py +0 -0
  177. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/group_norm.py +0 -0
  178. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  179. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/jsd.py +0 -0
  180. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/kl_div.py +0 -0
  181. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/layer_norm.py +0 -0
  182. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/__init__.py +0 -0
  183. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  184. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/monkey_patch.py +0 -0
  185. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  186. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/rms_norm.py +0 -0
  187. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/rope.py +0 -0
  188. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/sparsemax.py +0 -0
  189. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/swiglu.py +0 -0
  190. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  191. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  192. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  193. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/tvd.py +0 -0
  194. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/triton/__init__.py +0 -0
  195. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/triton/monkey_patch.py +0 -0
  196. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/utils.py +0 -0
  197. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
  198. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  199. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  200. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  201. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/__init__.py +0 -0
  202. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/__init__.py +0 -0
  203. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_cpo_loss.py +0 -0
  204. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_dpo_loss.py +0 -0
  205. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_grpo_loss.py +0 -0
  206. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_jsd_loss.py +0 -0
  207. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_kto_loss.py +0 -0
  208. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_orpo_loss.py +0 -0
  209. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_simpo_loss.py +0 -0
  210. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/conftest.py +0 -0
  211. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/__init__.py +0 -0
  212. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/bf16/__init__.py +0 -0
  213. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
  214. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
  215. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/fp32/__init__.py +0 -0
  216. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
  217. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
  218. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  219. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  220. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  221. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  222. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  223. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  224. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  225. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  226. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  227. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/tiny_shakespeare.txt +0 -0
  228. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  229. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  230. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  231. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_auto_model.py +0 -0
  232. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_cross_entropy.py +0 -0
  233. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_dyt.py +0 -0
  234. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_embedding.py +0 -0
  235. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_flex_attention.py +0 -0
  236. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  237. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_fused_linear_jsd.py +0 -0
  238. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_geglu.py +0 -0
  239. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_group_norm.py +0 -0
  240. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_grpo_loss.py +0 -0
  241. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_jsd.py +0 -0
  242. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_kl_div.py +0 -0
  243. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_layer_norm.py +0 -0
  244. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_mm_int8int2.py +0 -0
  245. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_monkey_patch.py +0 -0
  246. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_qwen2vl_mrope.py +0 -0
  247. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_rms_norm.py +0 -0
  248. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_rope.py +0 -0
  249. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_sparsemax.py +0 -0
  250. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_swiglu.py +0 -0
  251. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_trainer_integration.py +0 -0
  252. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_transformers.py +0 -0
  253. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_tvd.py +0 -0
  254. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/triton/test_triton_monkey_patch.py +0 -0
  255. {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/utils.py +0 -0
@@ -14,6 +14,9 @@ venv/
14
14
  build/
15
15
  dist/
16
16
 
17
+ # Doc Build
18
+ doc_site/
19
+
17
20
  # Lockfiles
18
21
  uv.lock
19
22
 
@@ -48,13 +48,19 @@ run-benchmarks:
48
48
  # MkDocs Configuration
49
49
  MKDOCS = mkdocs
50
50
  CONFIG_FILE = mkdocs.yml
51
+ SITE_DIR = doc_site
51
52
 
52
53
  # MkDocs targets
54
+
55
+ # Serve the documentation
53
56
  serve:
54
57
  $(MKDOCS) serve -f $(CONFIG_FILE)
55
58
 
59
+ # Build the documentation into the specified site directory
56
60
  build:
57
- $(MKDOCS) build -f $(CONFIG_FILE)
61
+ $(MKDOCS) build -f $(CONFIG_FILE) --site-dir $(SITE_DIR)
58
62
 
63
+ # Clean the output directory
59
64
  clean:
60
- rm -rf site/
65
+ rm -rf $(SITE_DIR)/
66
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.5.10.dev20250522174514
3
+ Version: 0.5.10.dev20250523162037
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -1,5 +1,6 @@
1
1
  site_name: Liger-Kernel Docs
2
- # site_url: ...
2
+ site_dir: './doc_site'
3
+ # site_url: https://linkedin.github.io/Liger-Kernel/
3
4
  # site_author: LinkedIn
4
5
  site_description: Efficient Triton Kernels for LLM Training
5
6
  theme:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "liger_kernel_nightly"
7
- version = "0.5.10.dev20250522174514"
7
+ version = "0.5.10.dev20250523162037"
8
8
  description = "Efficient Triton kernels for LLM Training"
9
9
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -137,6 +137,7 @@ def lce_forward(
137
137
  return_dict: Optional[bool] = None,
138
138
  cache_position: Optional[torch.LongTensor] = None,
139
139
  logits_to_keep: Union[int, torch.Tensor] = 0,
140
+ skip_logits: Optional[bool] = None,
140
141
  **loss_kwargs,
141
142
  ) -> Union[Tuple, CausalLMOutputWithPast]:
142
143
  r"""
@@ -199,8 +200,15 @@ def lce_forward(
199
200
  shift_labels = loss_kwargs.pop("shift_labels", None)
200
201
  logits = None
201
202
  loss = None
202
- # if in training mode, don't materialize logits
203
- if self.training and (labels is not None or shift_labels is not None):
203
+
204
+ if skip_logits and labels is None and shift_labels is None:
205
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
206
+
207
+ if skip_logits is None:
208
+ # By default, if in training mode, don't materialize logits
209
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
210
+
211
+ if skip_logits:
204
212
  loss = LigerForCausalLMLoss(
205
213
  hidden_states=kept_hidden_states,
206
214
  lm_head_weight=self.lm_head.weight,
@@ -209,7 +217,7 @@ def lce_forward(
209
217
  hidden_size=self.config.hidden_size,
210
218
  **loss_kwargs,
211
219
  )
212
- else: # if in inference mode materialize logits
220
+ else:
213
221
  logits = self.lm_head(kept_hidden_states)
214
222
  if labels is not None:
215
223
  loss = self.loss_function(
@@ -146,6 +146,7 @@ def lce_forward(
146
146
  return_dict: Optional[bool] = None,
147
147
  cache_position: Optional[torch.LongTensor] = None,
148
148
  logits_to_keep: Union[int, torch.Tensor] = 0,
149
+ skip_logits: Optional[bool] = None,
149
150
  **loss_kwargs,
150
151
  ) -> Union[Tuple, CausalLMOutputWithPast]:
151
152
  r"""
@@ -213,8 +214,15 @@ def lce_forward(
213
214
  shift_labels = loss_kwargs.pop("shift_labels", None)
214
215
  logits = None
215
216
  loss = None
216
- # if in training mode, don't materialize logits
217
- if self.training and (labels is not None or shift_labels is not None):
217
+
218
+ if skip_logits and labels is None and shift_labels is None:
219
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
220
+
221
+ if skip_logits is None:
222
+ # By default, if in training mode, don't materialize logits
223
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
224
+
225
+ if skip_logits:
218
226
  loss = LigerForCausalLMLoss(
219
227
  hidden_states=kept_hidden_states,
220
228
  lm_head_weight=self.lm_head.weight,
@@ -225,7 +233,7 @@ def lce_forward(
225
233
  **loss_kwargs,
226
234
  )
227
235
 
228
- else: # if in inference mode materialize logits
236
+ else:
229
237
  logits = self.lm_head(kept_hidden_states)
230
238
  if self.config.final_logit_softcapping is not None:
231
239
  logits = logits / self.config.final_logit_softcapping
@@ -35,6 +35,7 @@ def causal_forward(
35
35
  return_dict: Optional[bool] = None,
36
36
  cache_position: Optional[torch.LongTensor] = None,
37
37
  logits_to_keep: Union[int, torch.Tensor] = 0,
38
+ skip_logits: Optional[bool] = None,
38
39
  **loss_kwargs,
39
40
  ) -> Union[Tuple, CausalLMOutputWithPast]:
40
41
  r"""
@@ -101,7 +102,11 @@ def causal_forward(
101
102
  shift_labels = loss_kwargs.pop("shift_labels", None)
102
103
  loss = None
103
104
  logits = None
104
- if self.training and (labels is not None or shift_labels is not None):
105
+
106
+ if skip_logits is None:
107
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
108
+
109
+ if skip_logits:
105
110
  loss = LigerForCausalLMLoss(
106
111
  hidden_states=kept_hidden_states,
107
112
  lm_head_weight=self.lm_head.weight,
@@ -151,6 +156,7 @@ def multimodal_forward(
151
156
  output_hidden_states: Optional[bool] = None,
152
157
  return_dict: Optional[bool] = None,
153
158
  logits_to_keep: Union[int, torch.Tensor] = 0,
159
+ skip_logits: Optional[bool] = None,
154
160
  **lm_kwargs,
155
161
  ) -> Union[Tuple, Gemma3CausalLMOutputWithPast]:
156
162
  r"""
@@ -272,7 +278,13 @@ def multimodal_forward(
272
278
  loss = None
273
279
  logits = None
274
280
 
275
- if self.training and (labels is not None):
281
+ if skip_logits and labels is None:
282
+ raise ValueError("skip_logits is True, but labels is None")
283
+
284
+ if skip_logits is None:
285
+ skip_logits = self.training and (labels is not None)
286
+
287
+ if skip_logits:
276
288
  shift_hidden_states = hidden_states[..., :-1, :]
277
289
  shift_labels = labels[..., 1:]
278
290
 
@@ -26,6 +26,7 @@ def lce_forward(
26
26
  return_dict: Optional[bool] = None,
27
27
  cache_position: Optional[torch.LongTensor] = None,
28
28
  logits_to_keep: Union[int, torch.Tensor] = 0,
29
+ skip_logits: Optional[bool] = None,
29
30
  **loss_kwargs,
30
31
  ) -> Union[Tuple, CausalLMOutputWithPast]:
31
32
  r"""
@@ -89,8 +90,15 @@ def lce_forward(
89
90
  shift_labels = loss_kwargs.pop("shift_labels", None)
90
91
  logits = None
91
92
  loss = None
92
- # if in training mode, don't materialize logits
93
- if self.training and (labels is not None or shift_labels is not None):
93
+
94
+ if skip_logits and labels is None and shift_labels is None:
95
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
96
+
97
+ if skip_logits is None:
98
+ # By default, if in training mode, don't materialize logits
99
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
100
+
101
+ if skip_logits:
94
102
  loss = LigerForCausalLMLoss(
95
103
  hidden_states=kept_hidden_states,
96
104
  lm_head_weight=self.lm_head.weight,
@@ -100,7 +108,7 @@ def lce_forward(
100
108
  **loss_kwargs,
101
109
  )
102
110
 
103
- else: # if in inference mode materialize logits
111
+ else:
104
112
  logits = self.lm_head(kept_hidden_states)
105
113
  if labels is not None:
106
114
  loss = self.loss_function(
@@ -151,6 +151,7 @@ def lce_forward(
151
151
  return_dict: Optional[bool] = None,
152
152
  cache_position: Optional[torch.LongTensor] = None,
153
153
  logits_to_keep: Union[int, torch.Tensor] = 0,
154
+ skip_logits: Optional[bool] = None,
154
155
  **loss_kwargs,
155
156
  ) -> Union[Tuple, CausalLMOutputWithPast]:
156
157
  r"""
@@ -218,7 +219,14 @@ def lce_forward(
218
219
  logits = None
219
220
  loss = None
220
221
  # if in training mode, don't materialize logits
221
- if self.training and (labels is not None or shift_labels is not None):
222
+ if skip_logits and labels is None and shift_labels is None:
223
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
224
+
225
+ if skip_logits is None:
226
+ # By default, if in training mode, don't materialize logits
227
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
228
+
229
+ if skip_logits:
222
230
  loss = lce_maybe_trainable_lm_head(
223
231
  self,
224
232
  hidden_states=kept_hidden_states,
@@ -228,7 +236,7 @@ def lce_forward(
228
236
  **loss_kwargs,
229
237
  )
230
238
 
231
- else: # if in inference mode materialize logits
239
+ else:
232
240
  logits = self.lm_head(kept_hidden_states)
233
241
  if labels is not None:
234
242
  loss = self.loss_function(
@@ -223,6 +223,7 @@ def lce_forward(
223
223
  cache_position: Optional[torch.LongTensor] = None,
224
224
  logits_to_keep: Union[int, torch.Tensor] = 0,
225
225
  image_sizes: torch.Tensor = None,
226
+ skip_logits: Optional[bool] = None,
226
227
  **lm_kwargs,
227
228
  ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
228
229
  r"""
@@ -325,7 +326,10 @@ def lce_forward(
325
326
  loss = None
326
327
  logits = None
327
328
 
328
- if self.training and (labels is not None):
329
+ # Overwrite skip_logits, since llava never materializes logits
330
+ skip_logits = labels is not None
331
+
332
+ if skip_logits:
329
333
  # Shift so that tokens < n predict n
330
334
  if attention_mask is not None:
331
335
  # we use the input attention mask to shift the logits and labels, because it is 2D.
@@ -27,6 +27,7 @@ def lce_forward(
27
27
  return_dict: Optional[bool] = None,
28
28
  cache_position: Optional[torch.LongTensor] = None,
29
29
  logits_to_keep: Union[int, torch.Tensor] = 0,
30
+ skip_logits: Optional[bool] = None,
30
31
  **loss_kwargs,
31
32
  ) -> Union[Tuple, CausalLMOutputWithPast]:
32
33
  r"""
@@ -93,7 +94,13 @@ def lce_forward(
93
94
  loss = None
94
95
  logits = None
95
96
 
96
- if self.training and (labels is not None or shift_labels is not None):
97
+ if skip_logits and labels is None and shift_labels is None:
98
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
99
+
100
+ if skip_logits is None:
101
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
102
+
103
+ if skip_logits:
97
104
  loss = LigerForCausalLMLoss(
98
105
  hidden_states=kept_hidden_states,
99
106
  lm_head_weight=self.lm_head.weight,
@@ -156,6 +156,7 @@ def lce_forward(
156
156
  return_dict: Optional[bool] = None,
157
157
  cache_position: Optional[torch.LongTensor] = None,
158
158
  logits_to_keep: Union[int, torch.Tensor] = 0,
159
+ skip_logits: Optional[bool] = None,
159
160
  **loss_kwargs,
160
161
  ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
161
162
  r"""
@@ -224,8 +225,15 @@ def lce_forward(
224
225
  shift_labels = loss_kwargs.pop("shift_labels", None)
225
226
  logits = None
226
227
  loss = None
227
- # if in training mode, don't materialize logits
228
- if self.training and (labels is not None or shift_labels is not None):
228
+
229
+ if skip_logits and labels is None and shift_labels is None:
230
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
231
+
232
+ if skip_logits is None:
233
+ # By default, if in training mode, don't materialize logits
234
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
235
+
236
+ if skip_logits:
229
237
  loss = LigerForCausalLMLoss(
230
238
  hidden_states=kept_hidden_states,
231
239
  lm_head_weight=self.lm_head.weight,
@@ -235,7 +243,7 @@ def lce_forward(
235
243
  **loss_kwargs,
236
244
  )
237
245
 
238
- else: # if in inference mode materialize logits
246
+ else:
239
247
  logits = self.lm_head(kept_hidden_states)
240
248
 
241
249
  loss = None
@@ -147,6 +147,7 @@ def lce_forward(
147
147
  return_dict: Optional[bool] = None,
148
148
  cache_position: Optional[torch.LongTensor] = None,
149
149
  logits_to_keep: Union[int, torch.Tensor] = 0,
150
+ skip_logits: Optional[bool] = None,
150
151
  **loss_kwargs,
151
152
  ) -> Union[Tuple, CausalLMOutputWithPast]:
152
153
  r"""
@@ -215,8 +216,15 @@ def lce_forward(
215
216
  shift_labels = loss_kwargs.pop("shift_labels", None)
216
217
  logits = None
217
218
  loss = None
218
- # if in training mode, don't materialize logits
219
- if self.training and (labels is not None or shift_labels is not None):
219
+
220
+ if skip_logits and labels is None and shift_labels is None:
221
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
222
+
223
+ if skip_logits is None:
224
+ # By default, if in training mode, don't materialize logits
225
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
226
+
227
+ if skip_logits:
220
228
  loss = LigerForCausalLMLoss(
221
229
  hidden_states=kept_hidden_states,
222
230
  lm_head_weight=self.lm_head.weight,
@@ -226,7 +234,7 @@ def lce_forward(
226
234
  **loss_kwargs,
227
235
  )
228
236
 
229
- else: # if in inference mode materialize logits
237
+ else:
230
238
  logits = self.lm_head(kept_hidden_states)
231
239
  if labels is not None:
232
240
  loss = self.loss_function(
@@ -26,6 +26,7 @@ def lce_forward(
26
26
  return_dict: Optional[bool] = None,
27
27
  cache_position: Optional[torch.LongTensor] = None,
28
28
  logits_to_keep: Union[int, torch.Tensor] = 0,
29
+ skip_logits: Optional[bool] = None,
29
30
  **loss_kwargs,
30
31
  ) -> Union[Tuple, CausalLMOutputWithPast]:
31
32
  r"""
@@ -89,8 +90,15 @@ def lce_forward(
89
90
  shift_labels = loss_kwargs.pop("shift_labels", None)
90
91
  logits = None
91
92
  loss = None
92
- # if in training mode, don't materialize logits
93
- if self.training and (labels is not None or shift_labels is not None):
93
+
94
+ if skip_logits and labels is None and shift_labels is None:
95
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
96
+
97
+ if skip_logits is None:
98
+ # By default, if in training mode, don't materialize logits
99
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
100
+
101
+ if skip_logits:
94
102
  loss = LigerForCausalLMLoss(
95
103
  hidden_states=kept_hidden_states,
96
104
  lm_head_weight=self.lm_head.weight,
@@ -100,7 +108,7 @@ def lce_forward(
100
108
  **loss_kwargs,
101
109
  )
102
110
 
103
- else: # if in inference mode materialize logits
111
+ else:
104
112
  logits = self.lm_head(kept_hidden_states)
105
113
  if labels is not None:
106
114
  loss = self.loss_function(
@@ -216,6 +216,7 @@ def lce_forward(
216
216
  output_hidden_states: Optional[bool] = None,
217
217
  return_dict: Optional[bool] = None,
218
218
  logits_to_keep: Union[int, torch.Tensor] = 0,
219
+ skip_logits: Optional[bool] = None,
219
220
  **lm_kwargs,
220
221
  ) -> Union[Tuple, PaliGemmaCausalLMOutputWithPast]:
221
222
  r"""
@@ -331,7 +332,13 @@ def lce_forward(
331
332
  loss = None
332
333
  logits = None
333
334
 
334
- if self.training and (labels is not None):
335
+ if skip_logits and labels is None:
336
+ raise ValueError("skip_logits is True, but labels is None")
337
+
338
+ if skip_logits is None:
339
+ skip_logits = self.training and (labels is not None)
340
+
341
+ if skip_logits:
335
342
  shift_hidden_states = hidden_states[..., :-1, :]
336
343
  shift_labels = labels[..., 1:]
337
344
 
@@ -136,6 +136,7 @@ def lce_forward(
136
136
  return_dict: Optional[bool] = None,
137
137
  cache_position: Optional[torch.LongTensor] = None,
138
138
  logits_to_keep: Union[int, torch.Tensor] = 0,
139
+ skip_logits: Optional[bool] = None,
139
140
  **loss_kwargs,
140
141
  ) -> Union[Tuple, CausalLMOutputWithPast]:
141
142
  r"""
@@ -212,8 +213,15 @@ def lce_forward(
212
213
  shift_labels = loss_kwargs.pop("shift_labels", None)
213
214
  logits = None
214
215
  loss = None
215
- # if in training mode, don't materialize logits
216
- if self.training and (labels is not None or shift_labels is not None):
216
+
217
+ if skip_logits and labels is None and shift_labels is None:
218
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
219
+
220
+ if skip_logits is None:
221
+ # By default, if in training mode, don't materialize logits
222
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
223
+
224
+ if skip_logits:
217
225
  loss = LigerForCausalLMLoss(
218
226
  hidden_states=kept_hidden_states,
219
227
  lm_head_weight=self.lm_head.weight,
@@ -223,7 +231,7 @@ def lce_forward(
223
231
  **loss_kwargs,
224
232
  )
225
233
 
226
- else: # if in inference mode materialize logits
234
+ else:
227
235
  logits = self.lm_head(kept_hidden_states)
228
236
  if labels is not None:
229
237
  loss = self.loss_function(
@@ -135,6 +135,7 @@ def lce_forward(
135
135
  return_dict: Optional[bool] = None,
136
136
  cache_position: Optional[torch.LongTensor] = None,
137
137
  logits_to_keep: Union[int, torch.Tensor] = 0,
138
+ skip_logits: Optional[bool] = None,
138
139
  **loss_kwargs,
139
140
  ) -> Union[Tuple, CausalLMOutputWithPast]:
140
141
  r"""
@@ -198,8 +199,15 @@ def lce_forward(
198
199
  shift_labels = loss_kwargs.pop("shift_labels", None)
199
200
  logits = None
200
201
  loss = None
201
- # if in training mode, don't materialize logits
202
- if self.training and (labels is not None or shift_labels is not None):
202
+
203
+ if skip_logits and labels is None and shift_labels is None:
204
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
205
+
206
+ if skip_logits is None:
207
+ # By default, if in training mode, don't materialize logits
208
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
209
+
210
+ if skip_logits:
203
211
  loss = LigerForCausalLMLoss(
204
212
  hidden_states=kept_hidden_states,
205
213
  lm_head_weight=self.lm_head.weight,
@@ -209,7 +217,7 @@ def lce_forward(
209
217
  **loss_kwargs,
210
218
  )
211
219
 
212
- else: # if in inference mode materialize logits
220
+ else:
213
221
  logits = self.lm_head(kept_hidden_states)
214
222
  if labels is not None:
215
223
  loss = self.loss_function(
@@ -30,6 +30,7 @@ def lce_forward(
30
30
  rope_deltas: Optional[torch.LongTensor] = None,
31
31
  cache_position: Optional[torch.LongTensor] = None,
32
32
  second_per_grid_ts: Optional[torch.Tensor] = None,
33
+ skip_logits: Optional[bool] = None,
33
34
  **loss_kwargs,
34
35
  ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
35
36
  r"""
@@ -161,7 +162,13 @@ def lce_forward(
161
162
  loss = None
162
163
  logits = None
163
164
 
164
- if self.training and (labels is not None or shift_labels is not None):
165
+ if skip_logits and labels is None and shift_labels is None:
166
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
167
+
168
+ if skip_logits is None:
169
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
170
+
171
+ if skip_logits:
165
172
  loss = LigerForCausalLMLoss(
166
173
  hidden_states=hidden_states,
167
174
  lm_head_weight=self.lm_head.weight,
@@ -31,6 +31,7 @@ def lce_forward(
31
31
  video_grid_thw: Optional[torch.LongTensor] = None,
32
32
  rope_deltas: Optional[torch.LongTensor] = None,
33
33
  cache_position: Optional[torch.LongTensor] = None,
34
+ skip_logits: Optional[bool] = None,
34
35
  **loss_kwargs,
35
36
  ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
36
37
  r"""
@@ -165,7 +166,13 @@ def lce_forward(
165
166
  loss = None
166
167
  logits = None
167
168
 
168
- if self.training and (labels is not None or shift_labels is not None):
169
+ if skip_logits and labels is None and shift_labels is None:
170
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
171
+
172
+ if skip_logits is None:
173
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
174
+
175
+ if skip_logits:
169
176
  loss = LigerForCausalLMLoss(
170
177
  hidden_states=hidden_states,
171
178
  lm_head_weight=self.lm_head.weight,
@@ -22,6 +22,7 @@ def lce_forward(
22
22
  output_hidden_states: Optional[bool] = None,
23
23
  cache_position: Optional[torch.LongTensor] = None,
24
24
  logits_to_keep: Union[int, torch.Tensor] = 0,
25
+ skip_logits: Optional[bool] = None,
25
26
  **kwargs,
26
27
  ) -> CausalLMOutputWithPast:
27
28
  r"""
@@ -82,8 +83,15 @@ def lce_forward(
82
83
  shift_labels = kwargs.pop("shift_labels", None)
83
84
  logits = None
84
85
  loss = None
85
- # if in training mode, don't materialize logits
86
- if self.training and (labels is not None or shift_labels is not None):
86
+
87
+ if skip_logits and labels is None and shift_labels is None:
88
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
89
+
90
+ if skip_logits is None:
91
+ # By default, if in training mode, don't materialize logits
92
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
93
+
94
+ if skip_logits:
87
95
  loss = LigerForCausalLMLoss(
88
96
  hidden_states=kept_hidden_states,
89
97
  lm_head_weight=self.lm_head.weight,
@@ -93,7 +101,7 @@ def lce_forward(
93
101
  **kwargs,
94
102
  )
95
103
 
96
- else: # if in inference mode materialize logits
104
+ else:
97
105
  logits = self.lm_head(kept_hidden_states)
98
106
  if labels is not None:
99
107
  loss = self.loss_function(
@@ -25,6 +25,7 @@ def lce_forward(
25
25
  output_router_logits: Optional[bool] = None,
26
26
  cache_position: Optional[torch.LongTensor] = None,
27
27
  logits_to_keep: Union[int, torch.Tensor] = 0,
28
+ skip_logits: Optional[bool] = None,
28
29
  **loss_kwargs,
29
30
  ) -> MoeCausalLMOutputWithPast:
30
31
  r"""
@@ -91,8 +92,10 @@ def lce_forward(
91
92
  logits = None
92
93
  loss = None
93
94
 
94
- # if in training mode, do not materialize logits
95
- if self.training and (labels is not None or shift_labels is not None):
95
+ if skip_logits is None:
96
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
97
+
98
+ if skip_logits:
96
99
  loss = LigerForCausalLMLoss(
97
100
  hidden_states=kept_hidden_states,
98
101
  lm_head_weight=self.lm_head.weight,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.5.10.dev20250522174514
3
+ Version: 0.5.10.dev20250523162037
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation