liger-kernel-nightly 0.6.3.dev20251121195543__tar.gz → 0.6.3.dev20251121200119__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (305)
  1. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/PKG-INFO +2 -1
  2. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/README.md +1 -0
  3. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/pyproject.toml +1 -1
  4. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/__init__.py +3 -0
  5. liger_kernel_nightly-0.6.3.dev20251121200119/src/liger_kernel/transformers/model/olmo3.py +142 -0
  6. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/monkey_patch.py +70 -1
  7. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/swiglu.py +1 -1
  8. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel_nightly.egg-info/PKG-INFO +2 -1
  9. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel_nightly.egg-info/SOURCES.txt +1 -0
  10. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/bf16/test_mini_models.py +60 -1
  11. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/bf16/test_mini_models_with_logits.py +60 -1
  12. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/fp32/test_mini_models.py +57 -1
  13. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/fp32/test_mini_models_with_logits.py +57 -1
  14. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_monkey_patch.py +55 -0
  15. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/utils.py +12 -0
  16. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  17. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  18. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/pull_request_template.md +0 -0
  19. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/workflows/amd-ci.yml +0 -0
  20. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/workflows/benchmark.yml +0 -0
  21. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/workflows/docs.yml +0 -0
  22. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/workflows/intel-ci.yml +0 -0
  23. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/workflows/nvi-ci.yml +0 -0
  24. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/workflows/publish-nightly.yml +0 -0
  25. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.github/workflows/publish-release.yml +0 -0
  26. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/.gitignore +0 -0
  27. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/LICENSE +0 -0
  28. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/Makefile +0 -0
  29. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/NOTICE +0 -0
  30. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/README.md +0 -0
  31. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/__init__.py +0 -0
  32. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/benchmarks_visualizer.py +0 -0
  33. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/data/all_benchmark_data.csv +0 -0
  34. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/__init__.py +0 -0
  35. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  36. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  37. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  38. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  39. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  40. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_dyt.py +0 -0
  41. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_embedding.py +0 -0
  42. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
  43. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  44. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  45. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  46. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_geglu.py +0 -0
  47. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_group_norm.py +0 -0
  48. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_grpo_loss.py +0 -0
  49. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_jsd.py +0 -0
  50. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_kl_div.py +0 -0
  51. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  52. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  53. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
  54. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  55. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  56. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_poly_norm.py +0 -0
  57. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  58. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  59. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_rope.py +0 -0
  60. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  61. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_softmax.py +0 -0
  62. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  63. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  64. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_swiglu.py +0 -0
  65. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_tiled_mlp.py +0 -0
  66. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/benchmark_tvd.py +0 -0
  67. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/benchmark/scripts/utils.py +0 -0
  68. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/dev/fmt-requirements.txt +0 -0
  69. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/dev/modal/benchmarks.py +0 -0
  70. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/dev/modal/tests.py +0 -0
  71. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/dev/modal/tests_bwd.py +0 -0
  72. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/Examples.md +0 -0
  73. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/Getting-Started.md +0 -0
  74. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/High-Level-APIs.md +0 -0
  75. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/Low-Level-APIs.md +0 -0
  76. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/acknowledgement.md +0 -0
  77. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/contributing.md +0 -0
  78. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/images/banner.GIF +0 -0
  79. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/images/compose.gif +0 -0
  80. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/images/e2e-memory.png +0 -0
  81. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/images/e2e-tps.png +0 -0
  82. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/images/logo-banner.png +0 -0
  83. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/images/patch.gif +0 -0
  84. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/images/post-training.png +0 -0
  85. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/index.md +0 -0
  86. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/docs/license.md +0 -0
  87. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/alignment/accelerate_config.yaml +0 -0
  88. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/alignment/run_orpo.py +0 -0
  89. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/README.md +0 -0
  90. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/callback.py +0 -0
  91. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/config/fsdp_config.json +0 -0
  92. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  93. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  94. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  95. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/img/llama_tps.png +0 -0
  96. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  97. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/img/qwen_tps.png +0 -0
  98. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/launch_on_modal.py +0 -0
  99. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/requirements.txt +0 -0
  100. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/run_benchmarks.sh +0 -0
  101. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/run_gemma.sh +0 -0
  102. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/run_llama.sh +0 -0
  103. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/run_qwen.sh +0 -0
  104. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/run_qwen2_vl.sh +0 -0
  105. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/training.py +0 -0
  106. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/huggingface/training_multimodal.py +0 -0
  107. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/lightning/README.md +0 -0
  108. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/lightning/requirements.txt +0 -0
  109. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/lightning/training.py +0 -0
  110. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/README.md +0 -0
  111. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/callback.py +0 -0
  112. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  113. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  114. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  115. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  116. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  117. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  118. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  119. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  120. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  121. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/medusa_util.py +0 -0
  122. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/requirements.txt +0 -0
  123. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  124. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/examples/medusa/train.py +0 -0
  125. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/licenses/LICENSE-Apache-2.0 +0 -0
  126. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  127. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  128. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/licenses/LICENSE-MIT-llmc +0 -0
  129. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/licenses/LICENSE-MIT-triton +0 -0
  130. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/mkdocs.yml +0 -0
  131. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/setup.cfg +0 -0
  132. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/setup.py +0 -0
  133. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/__init__.py +0 -0
  134. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/README.md +0 -0
  135. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  136. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
  137. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  138. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  139. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/functional.py +0 -0
  140. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  141. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  142. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  143. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  144. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  145. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  146. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  147. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  148. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  149. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/env_report.py +0 -0
  150. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/__init__.py +0 -0
  151. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/cross_entropy.py +0 -0
  152. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/dyt.py +0 -0
  153. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  154. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  155. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
  156. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  157. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  158. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  159. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/geglu.py +0 -0
  160. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/group_norm.py +0 -0
  161. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/grpo_loss.py +0 -0
  162. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/jsd.py +0 -0
  163. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/kl_div.py +0 -0
  164. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/layer_norm.py +0 -0
  165. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/llama4_rope.py +0 -0
  166. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  167. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/poly_norm.py +0 -0
  168. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  169. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/rms_norm.py +0 -0
  170. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/rope.py +0 -0
  171. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/softmax.py +0 -0
  172. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/sparsemax.py +0 -0
  173. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/swiglu.py +0 -0
  174. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/tiled_mlp.py +0 -0
  175. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/tvd.py +0 -0
  176. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/ops/utils.py +0 -0
  177. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/auto_model.py +0 -0
  178. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  179. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/dyt.py +0 -0
  180. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
  181. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  182. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/fsdp.py +0 -0
  183. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/functional.py +0 -0
  184. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
  185. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  186. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  187. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  188. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/geglu.py +0 -0
  189. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/group_norm.py +0 -0
  190. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  191. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/jsd.py +0 -0
  192. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/kl_div.py +0 -0
  193. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/layer_norm.py +0 -0
  194. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/llama4_rope.py +0 -0
  195. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/__init__.py +0 -0
  196. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/falcon_h1.py +0 -0
  197. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/gemma.py +0 -0
  198. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  199. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/gemma3.py +0 -0
  200. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/glm4.py +0 -0
  201. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/glm4v.py +0 -0
  202. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/glm4v_moe.py +0 -0
  203. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/hunyuan_v1.py +0 -0
  204. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/internvl.py +0 -0
  205. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/llama.py +0 -0
  206. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/llama4.py +0 -0
  207. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/llava.py +0 -0
  208. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  209. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/mistral.py +0 -0
  210. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  211. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/mllama.py +0 -0
  212. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  213. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/output_classes.py +0 -0
  214. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/paligemma.py +0 -0
  215. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/phi3.py +0 -0
  216. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  217. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  218. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  219. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  220. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  221. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/qwen3_next.py +0 -0
  222. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/qwen3_vl.py +0 -0
  223. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/qwen3_vl_moe.py +0 -0
  224. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/smollm3.py +0 -0
  225. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/model/smolvlm.py +0 -0
  226. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  227. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/poly_norm.py +0 -0
  228. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  229. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/rms_norm.py +0 -0
  230. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/rope.py +0 -0
  231. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/softmax.py +0 -0
  232. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/sparsemax.py +0 -0
  233. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/tiled_mlp.py +0 -0
  234. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  235. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  236. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  237. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/transformers/tvd.py +0 -0
  238. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/triton/__init__.py +0 -0
  239. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/triton/monkey_patch.py +0 -0
  240. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel/utils.py +0 -0
  241. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  242. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  243. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  244. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/__init__.py +0 -0
  245. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/__init__.py +0 -0
  246. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/test_cosine_loss.py +0 -0
  247. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/test_cpo_loss.py +0 -0
  248. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/test_dpo_loss.py +0 -0
  249. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/test_grpo_loss.py +0 -0
  250. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/test_jsd_loss.py +0 -0
  251. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/test_kto_loss.py +0 -0
  252. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/test_orpo_loss.py +0 -0
  253. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/chunked_loss/test_simpo_loss.py +0 -0
  254. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/conftest.py +0 -0
  255. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/__init__.py +0 -0
  256. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/bf16/__init__.py +0 -0
  257. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
  258. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/fp32/__init__.py +0 -0
  259. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
  260. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  261. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  262. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json +0 -0
  263. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  264. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  265. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  266. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json +0 -0
  267. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  268. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  269. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/Qwen/Qwen3-VL-4B-Instruct/tokenizer_config.json +0 -0
  270. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  271. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  272. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  273. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/tiny_shakespeare.txt +0 -0
  274. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  275. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  276. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  277. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_auto_model.py +0 -0
  278. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_cross_entropy.py +0 -0
  279. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_dyt.py +0 -0
  280. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_embedding.py +0 -0
  281. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_flex_attention.py +0 -0
  282. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_fused_add_rms_norm.py +0 -0
  283. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  284. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_fused_linear_jsd.py +0 -0
  285. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  286. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_geglu.py +0 -0
  287. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_group_norm.py +0 -0
  288. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_grpo_loss.py +0 -0
  289. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_jsd.py +0 -0
  290. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_kl_div.py +0 -0
  291. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_layer_norm.py +0 -0
  292. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_mm_int8int2.py +0 -0
  293. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_multi_token_attention.py +0 -0
  294. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_poly_norm.py +0 -0
  295. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_qwen2vl_mrope.py +0 -0
  296. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_rms_norm.py +0 -0
  297. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_rope.py +0 -0
  298. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_softmax.py +0 -0
  299. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_sparsemax.py +0 -0
  300. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_swiglu.py +0 -0
  301. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_tiled_mlp.py +0 -0
  302. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_trainer_integration.py +0 -0
  303. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_transformers.py +0 -0
  304. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/transformers/test_tvd.py +0 -0
  305. {liger_kernel_nightly-0.6.3.dev20251121195543 → liger_kernel_nightly-0.6.3.dev20251121200119}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.3.dev20251121195543
3
+ Version: 0.6.3.dev20251121200119
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -310,6 +310,7 @@ loss.backward()
310
310
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
311
311
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
312
312
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
313
+ | Olmo3 | `liger_kernel.transformers.apply_liger_kernel_to_olmo3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
313
314
  | GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
314
315
  | InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
315
316
  | HunyuanV1 | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -262,6 +262,7 @@ loss.backward()
262
262
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
263
263
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
264
264
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
265
+ | Olmo3 | `liger_kernel.transformers.apply_liger_kernel_to_olmo3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
265
266
  | GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
266
267
  | InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
267
268
  | HunyuanV1 | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "liger_kernel_nightly"
7
- version = "0.6.3.dev20251121195543"
7
+ version = "0.6.3.dev20251121200119"
8
8
  description = "Efficient Triton kernels for LLM Training"
9
9
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -52,6 +52,7 @@ if TYPE_CHECKING:
52
52
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mixtral # noqa: F401
53
53
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mllama # noqa: F401
54
54
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_olmo2 # noqa: F401
55
+ from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_olmo3 # noqa: F401
55
56
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_paligemma # noqa: F401
56
57
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_phi3 # noqa: F401
57
58
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2 # noqa: F401
@@ -118,6 +119,7 @@ def __getattr__(name: str):
118
119
  "apply_liger_kernel_to_mixtral",
119
120
  "apply_liger_kernel_to_mllama",
120
121
  "apply_liger_kernel_to_olmo2",
122
+ "apply_liger_kernel_to_olmo3",
121
123
  "apply_liger_kernel_to_paligemma",
122
124
  "apply_liger_kernel_to_phi3",
123
125
  "apply_liger_kernel_to_qwen2",
@@ -194,6 +196,7 @@ if _TRANSFORMERS_AVAILABLE:
194
196
  "apply_liger_kernel_to_mixtral",
195
197
  "apply_liger_kernel_to_mllama",
196
198
  "apply_liger_kernel_to_olmo2",
199
+ "apply_liger_kernel_to_olmo3",
197
200
  "apply_liger_kernel_to_paligemma",
198
201
  "apply_liger_kernel_to_phi3",
199
202
  "apply_liger_kernel_to_qwen2",
@@ -0,0 +1,142 @@
1
+ from typing import List
2
+ from typing import Optional
3
+ from typing import Tuple
4
+ from typing import Union
5
+
6
+ import torch
7
+
8
+ from transformers.modeling_outputs import BaseModelOutputWithPast
9
+ from transformers.utils.deprecation import deprecate_kwarg
10
+
11
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
12
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
13
+ from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
14
+
15
+
16
+ @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
17
+ def lce_forward(
18
+ self,
19
+ input_ids: torch.LongTensor = None,
20
+ attention_mask: Optional[torch.Tensor] = None,
21
+ position_ids: Optional[torch.LongTensor] = None,
22
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
23
+ inputs_embeds: Optional[torch.FloatTensor] = None,
24
+ labels: Optional[torch.LongTensor] = None,
25
+ use_cache: Optional[bool] = None,
26
+ output_attentions: Optional[bool] = None,
27
+ output_hidden_states: Optional[bool] = None,
28
+ return_dict: Optional[bool] = None,
29
+ cache_position: Optional[torch.LongTensor] = None,
30
+ logits_to_keep: Union[int, torch.Tensor] = 0,
31
+ skip_logits: Optional[bool] = None,
32
+ **kwargs,
33
+ ) -> Union[Tuple, LigerCausalLMOutputWithPast]:
34
+ r"""
35
+ Args:
36
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
37
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
38
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
39
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
40
+
41
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
42
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
43
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
44
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
45
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
46
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
47
+
48
+ Returns:
49
+
50
+ Example:
51
+
52
+ ```python
53
+ >>> from transformers import AutoTokenizer, Olmo3ForCausalLM
54
+
55
+ >>> model = Olmo3ForCausalLM.from_pretrained("allenai/Olmo-3-7B-Instruct")
56
+ >>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3-7B-Instruct")
57
+
58
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
59
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
60
+
61
+ >>> # Generate
62
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
63
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
64
+ 'Hey, are you conscious? Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m'
65
+ ```
66
+ """
67
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
68
+ output_hidden_states = (
69
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
70
+ )
71
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
72
+
73
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
74
+ outputs: BaseModelOutputWithPast = self.model(
75
+ input_ids=input_ids,
76
+ attention_mask=attention_mask,
77
+ position_ids=position_ids,
78
+ past_key_values=past_key_values,
79
+ inputs_embeds=inputs_embeds,
80
+ use_cache=use_cache,
81
+ output_attentions=output_attentions,
82
+ output_hidden_states=output_hidden_states,
83
+ return_dict=return_dict,
84
+ cache_position=cache_position,
85
+ **kwargs,
86
+ )
87
+
88
+ hidden_states = outputs[0]
89
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
90
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
91
+ kept_hidden_states = hidden_states[:, slice_indices, :]
92
+
93
+ shift_labels = kwargs.pop("shift_labels", None)
94
+ logits = None
95
+ loss = None
96
+ token_accuracy = None
97
+
98
+ if skip_logits and labels is None and shift_labels is None:
99
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
100
+
101
+ if skip_logits is None:
102
+ # By default, if in training mode, don't materialize logits
103
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
104
+
105
+ # Compute loss
106
+ if skip_logits:
107
+ result = LigerForCausalLMLoss(
108
+ hidden_states=kept_hidden_states,
109
+ lm_head_weight=self.lm_head.weight,
110
+ labels=labels,
111
+ shift_labels=shift_labels,
112
+ hidden_size=self.config.hidden_size,
113
+ **kwargs,
114
+ )
115
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
116
+
117
+ else:
118
+ logits = self.lm_head(kept_hidden_states)
119
+ if labels is not None or shift_labels is not None:
120
+ loss = self.loss_function(
121
+ logits=logits,
122
+ labels=labels,
123
+ shift_labels=shift_labels,
124
+ vocab_size=self.config.vocab_size,
125
+ **kwargs,
126
+ )
127
+
128
+ if not return_dict:
129
+ output = (logits,) + outputs[1:]
130
+ output = ((loss,) + output) if loss is not None else output
131
+ output = output + (token_accuracy,) if token_accuracy is not None else output
132
+ return output
133
+
134
+ # Return custom output class with token_accuracy field
135
+ return LigerCausalLMOutputWithPast(
136
+ loss=loss,
137
+ logits=logits,
138
+ past_key_values=outputs.past_key_values,
139
+ hidden_states=outputs.hidden_states,
140
+ attentions=outputs.attentions,
141
+ token_accuracy=token_accuracy,
142
+ )
@@ -1928,6 +1928,74 @@ def apply_liger_kernel_to_olmo2(
1928
1928
  _patch_rms_norm_module(decoder_layer.post_feedforward_layernorm, in_place=False)
1929
1929
 
1930
1930
 
1931
+ def apply_liger_kernel_to_olmo3(
1932
+ rope: bool = True,
1933
+ cross_entropy: bool = False,
1934
+ fused_linear_cross_entropy: bool = True,
1935
+ rms_norm: bool = True,
1936
+ swiglu: bool = True,
1937
+ model: PreTrainedModel = None,
1938
+ ) -> None:
1939
+ """
1940
+ Apply Liger kernels to replace original implementation in HuggingFace Olmo3 models.
1941
+
1942
+ Args:
1943
+ rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
1944
+ cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
1945
+ fused_linear_cross_entropy (bool):
1946
+ Whether to apply Liger's fused linear cross entropy loss. Default is True.
1947
+ `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
1948
+ If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
1949
+ rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
1950
+ swiglu (bool): Whether to apply Liger's SwiGLU to Olmo3MLP. Default is True.
1951
+ model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
1952
+ loaded. Default is None.
1953
+ """
1954
+ assert not (cross_entropy and fused_linear_cross_entropy), (
1955
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
1956
+ )
1957
+
1958
+ from transformers.models.olmo3 import modeling_olmo3
1959
+ from transformers.models.olmo3.modeling_olmo3 import Olmo3Model
1960
+
1961
+ from liger_kernel.transformers.model.olmo3 import lce_forward as olmo3_lce_forward
1962
+ from liger_kernel.transformers.rms_norm import LigerRMSNormForOlmo2
1963
+
1964
+ # Olmo3 arch is very similar to Olmo2, so we can reuse all these components in the same way.
1965
+ if rope:
1966
+ modeling_olmo3.apply_rotary_pos_emb = liger_rotary_pos_emb
1967
+ if rms_norm:
1968
+ modeling_olmo3.Olmo3RMSNorm = LigerRMSNormForOlmo2 # same as olmo2
1969
+ if swiglu:
1970
+ modeling_olmo3.Olmo3MLP = LigerSwiGLUMLP
1971
+ if cross_entropy:
1972
+ from transformers.loss.loss_utils import nn
1973
+
1974
+ nn.functional.cross_entropy = liger_cross_entropy
1975
+ if fused_linear_cross_entropy:
1976
+ if model is not None:
1977
+ model.forward = MethodType(olmo3_lce_forward, model)
1978
+ else:
1979
+ modeling_olmo3.Olmo3ForCausalLM.forward = olmo3_lce_forward
1980
+
1981
+ if model is not None:
1982
+ # The model instance already exists, so we need to additionally patch the
1983
+ # instance variables that reference already-instantiated modules
1984
+
1985
+ # get the base model from the model instance
1986
+ base_model: Olmo3Model = getattr(model, model.base_model_prefix, model)
1987
+
1988
+ if rms_norm:
1989
+ _patch_rms_norm_module(base_model.norm)
1990
+
1991
+ for decoder_layer in base_model.layers:
1992
+ if swiglu:
1993
+ _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
1994
+ if rms_norm:
1995
+ _patch_rms_norm_module(decoder_layer.post_attention_layernorm, in_place=False)
1996
+ _patch_rms_norm_module(decoder_layer.post_feedforward_layernorm, in_place=False)
1997
+
1998
+
1931
1999
  def apply_liger_kernel_to_glm4(
1932
2000
  rope: bool = False,
1933
2001
  cross_entropy: bool = False,
@@ -2589,7 +2657,7 @@ def apply_liger_kernel_to_hunyuan_v1_dense(
2589
2657
  from transformers.loss.loss_utils import nn
2590
2658
 
2591
2659
  nn.functional.cross_entropy = liger_cross_entropy
2592
-
2660
+
2593
2661
  if fused_linear_cross_entropy:
2594
2662
  if model is not None:
2595
2663
  model.forward = MethodType(hunyuan_v1_lce_forward, model)
@@ -2695,6 +2763,7 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
2695
2763
  "mistral": apply_liger_kernel_to_mistral,
2696
2764
  "mixtral": apply_liger_kernel_to_mixtral,
2697
2765
  "olmo2": apply_liger_kernel_to_olmo2,
2766
+ "olmo3": apply_liger_kernel_to_olmo3,
2698
2767
  "qwen2": apply_liger_kernel_to_qwen2,
2699
2768
  "qwen3": apply_liger_kernel_to_qwen3,
2700
2769
  "qwen3_moe": apply_liger_kernel_to_qwen3_moe,
@@ -93,4 +93,4 @@ class LigerHunyuanV1SwiGLUMLP(nn.Module):
93
93
  raise ValueError(f"Activation function {config.hidden_act} not supported.")
94
94
 
95
95
  def forward(self, x):
96
- return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x)))
96
+ return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x)))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.3.dev20251121195543
3
+ Version: 0.6.3.dev20251121200119
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -310,6 +310,7 @@ loss.backward()
310
310
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
311
311
  | Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
312
312
  | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
313
+ | Olmo3 | `liger_kernel.transformers.apply_liger_kernel_to_olmo3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
313
314
  | GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
314
315
  | InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
315
316
  | HunyuanV1 | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -210,6 +210,7 @@ src/liger_kernel/transformers/model/mistral.py
210
210
  src/liger_kernel/transformers/model/mixtral.py
211
211
  src/liger_kernel/transformers/model/mllama.py
212
212
  src/liger_kernel/transformers/model/olmo2.py
213
+ src/liger_kernel/transformers/model/olmo3.py
213
214
  src/liger_kernel/transformers/model/output_classes.py
214
215
  src/liger_kernel/transformers/model/paligemma.py
215
216
  src/liger_kernel/transformers/model/phi3.py
@@ -40,6 +40,7 @@ from liger_kernel.transformers import apply_liger_kernel_to_mistral
40
40
  from liger_kernel.transformers import apply_liger_kernel_to_mixtral
41
41
  from liger_kernel.transformers import apply_liger_kernel_to_mllama
42
42
  from liger_kernel.transformers import apply_liger_kernel_to_olmo2
43
+ from liger_kernel.transformers import apply_liger_kernel_to_olmo3
43
44
  from liger_kernel.transformers import apply_liger_kernel_to_phi3
44
45
  from liger_kernel.transformers import apply_liger_kernel_to_qwen2
45
46
  from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl
@@ -74,6 +75,7 @@ from test.utils import revert_liger_kernel_to_mistral
74
75
  from test.utils import revert_liger_kernel_to_mixtral
75
76
  from test.utils import revert_liger_kernel_to_mllama
76
77
  from test.utils import revert_liger_kernel_to_olmo2
78
+ from test.utils import revert_liger_kernel_to_olmo3
77
79
  from test.utils import revert_liger_kernel_to_phi3
78
80
  from test.utils import revert_liger_kernel_to_qwen2
79
81
  from test.utils import revert_liger_kernel_to_qwen2_5_vl
@@ -194,6 +196,15 @@ try:
194
196
  except ImportError:
195
197
  OLMO2_AVAILABLE = False
196
198
 
199
+ try:
200
+ # OLMO3 is only available in transformers>=4.57.0
201
+ from transformers.models.olmo3.configuration_olmo3 import Olmo3Config
202
+ from transformers.models.olmo3.modeling_olmo3 import Olmo3ForCausalLM
203
+
204
+ OLMO3_AVAILABLE = True
205
+ except ImportError:
206
+ OLMO3_AVAILABLE = False
207
+
197
208
  try:
198
209
  # Glm4 is only available in transformers>=4.51.3
199
210
  from transformers.models.glm4.configuration_glm4 import Glm4Config
@@ -1009,6 +1020,35 @@ if OLMO2_AVAILABLE:
1009
1020
  ),
1010
1021
  )
1011
1022
 
1023
+ if OLMO3_AVAILABLE:
1024
+ MINI_MODEL_SETUPS["mini_olmo3"] = MiniModelConfig(
1025
+ liger_kernel_patch_func=apply_liger_kernel_to_olmo3,
1026
+ liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo3,
1027
+ model_class=Olmo3ForCausalLM,
1028
+ mini_model_config=Olmo3Config(
1029
+ bos_token_id=1, # 128000
1030
+ eos_token_id=2, # 128001
1031
+ pad_token_id=2,
1032
+ cross_attention_layers=None,
1033
+ dropout=0,
1034
+ hidden_act="silu",
1035
+ hidden_size=1024, # 4096
1036
+ initializer_range=0.02,
1037
+ intermediate_size=2048, # 14336
1038
+ max_position_embeddings=4096,
1039
+ num_attention_heads=8, # 32
1040
+ num_hidden_layers=4, # 40
1041
+ num_key_value_heads=2, # 8
1042
+ rms_norm_eps=1e-5,
1043
+ rope_scaling=None,
1044
+ rope_theta=500_000,
1045
+ tie_word_embeddings=False,
1046
+ use_cache=True,
1047
+ vocab_size=32000, # 128256,
1048
+ attn_implementation="sdpa", # default value, pytorch native attention
1049
+ ),
1050
+ )
1051
+
1012
1052
  if GLM4_AVAILABLE:
1013
1053
  MINI_MODEL_SETUPS["mini_glm4"] = MiniModelConfig(
1014
1054
  liger_kernel_patch_func=apply_liger_kernel_to_glm4,
@@ -1351,7 +1391,7 @@ if HUNYUAN_V1_AVAILABLE:
1351
1391
  liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe,
1352
1392
  liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe,
1353
1393
  model_class=HunYuanMoEV1ForCausalLM,
1354
- mini_model_config = HunYuanMoEV1Config(
1394
+ mini_model_config=HunYuanMoEV1Config(
1355
1395
  vocab_size=32000,
1356
1396
  hidden_size=128,
1357
1397
  intermediate_size=512,
@@ -1751,6 +1791,25 @@ def run_mini_model(
1751
1791
  ),
1752
1792
  ],
1753
1793
  ),
1794
+ pytest.param(
1795
+ "mini_olmo3",
1796
+ 32,
1797
+ 1e-5,
1798
+ torch.bfloat16,
1799
+ 1e-2,
1800
+ 1e-2,
1801
+ 1e-1,
1802
+ 1e-2,
1803
+ 1e-2,
1804
+ 1e-2,
1805
+ marks=[
1806
+ pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
1807
+ pytest.mark.skipif(
1808
+ not OLMO3_AVAILABLE,
1809
+ reason="OLMO3 not available in this version of transformers",
1810
+ ),
1811
+ ],
1812
+ ),
1754
1813
  pytest.param(
1755
1814
  "mini_glm4",
1756
1815
  32,
@@ -40,6 +40,7 @@ from liger_kernel.transformers import apply_liger_kernel_to_mistral
40
40
  from liger_kernel.transformers import apply_liger_kernel_to_mixtral
41
41
  from liger_kernel.transformers import apply_liger_kernel_to_mllama
42
42
  from liger_kernel.transformers import apply_liger_kernel_to_olmo2
43
+ from liger_kernel.transformers import apply_liger_kernel_to_olmo3
43
44
  from liger_kernel.transformers import apply_liger_kernel_to_phi3
44
45
  from liger_kernel.transformers import apply_liger_kernel_to_qwen2
45
46
  from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl
@@ -74,6 +75,7 @@ from test.utils import revert_liger_kernel_to_mistral
74
75
  from test.utils import revert_liger_kernel_to_mixtral
75
76
  from test.utils import revert_liger_kernel_to_mllama
76
77
  from test.utils import revert_liger_kernel_to_olmo2
78
+ from test.utils import revert_liger_kernel_to_olmo3
77
79
  from test.utils import revert_liger_kernel_to_phi3
78
80
  from test.utils import revert_liger_kernel_to_qwen2
79
81
  from test.utils import revert_liger_kernel_to_qwen2_5_vl
@@ -186,6 +188,15 @@ try:
186
188
  except ImportError:
187
189
  OLMO2_AVAILABLE = False
188
190
 
191
+ try:
192
+ # OLMO3 is only available in transformers>=4.57.0
193
+ from transformers.models.olmo3.configuration_olmo3 import Olmo3Config
194
+ from transformers.models.olmo3.modeling_olmo3 import Olmo3ForCausalLM
195
+
196
+ OLMO3_AVAILABLE = True
197
+ except ImportError:
198
+ OLMO3_AVAILABLE = False
199
+
189
200
  try:
190
201
  # Glm4 is only available in transformers>=4.51.3
191
202
  from transformers.models.glm4.configuration_glm4 import Glm4Config
@@ -1005,6 +1016,35 @@ if OLMO2_AVAILABLE:
1005
1016
  ),
1006
1017
  )
1007
1018
 
1019
+ if OLMO3_AVAILABLE:
1020
+ MINI_MODEL_SETUPS["mini_olmo3"] = MiniModelConfig(
1021
+ liger_kernel_patch_func=apply_liger_kernel_to_olmo3,
1022
+ liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo3,
1023
+ model_class=Olmo3ForCausalLM,
1024
+ mini_model_config=Olmo3Config(
1025
+ bos_token_id=1, # 128000
1026
+ eos_token_id=2, # 128001
1027
+ pad_token_id=2,
1028
+ cross_attention_layers=None,
1029
+ dropout=0,
1030
+ hidden_act="silu",
1031
+ hidden_size=1024, # 4096
1032
+ initializer_range=0.02,
1033
+ intermediate_size=2048, # 14336
1034
+ max_position_embeddings=4096,
1035
+ num_attention_heads=8, # 32
1036
+ num_hidden_layers=4, # 40
1037
+ num_key_value_heads=2, # 8
1038
+ rms_norm_eps=1e-5,
1039
+ rope_scaling=None,
1040
+ rope_theta=500_000,
1041
+ tie_word_embeddings=False,
1042
+ use_cache=True,
1043
+ vocab_size=32000, # 128256,
1044
+ attn_implementation="sdpa", # default value, pytorch native attention
1045
+ ),
1046
+ )
1047
+
1008
1048
  if GLM4_AVAILABLE:
1009
1049
  MINI_MODEL_SETUPS["mini_glm4"] = MiniModelConfig(
1010
1050
  liger_kernel_patch_func=apply_liger_kernel_to_glm4,
@@ -1346,7 +1386,7 @@ if HUNYUAN_V1_AVAILABLE:
1346
1386
  liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe,
1347
1387
  liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe,
1348
1388
  model_class=HunYuanMoEV1ForCausalLM,
1349
- mini_model_config = HunYuanMoEV1Config(
1389
+ mini_model_config=HunYuanMoEV1Config(
1350
1390
  vocab_size=32000,
1351
1391
  hidden_size=128,
1352
1392
  intermediate_size=512,
@@ -1768,6 +1808,25 @@ def run_mini_model(
1768
1808
  ),
1769
1809
  ],
1770
1810
  ),
1811
+ pytest.param(
1812
+ "mini_olmo3",
1813
+ 32,
1814
+ 1e-4,
1815
+ torch.bfloat16,
1816
+ 1e-2,
1817
+ 1e-2,
1818
+ 1e-1,
1819
+ 1e-2,
1820
+ 1e-2,
1821
+ 1e-2,
1822
+ marks=[
1823
+ pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
1824
+ pytest.mark.skipif(
1825
+ not OLMO3_AVAILABLE,
1826
+ reason="OLMO3 not available in this version of transformers",
1827
+ ),
1828
+ ],
1829
+ ),
1771
1830
  pytest.param(
1772
1831
  "mini_glm4",
1773
1832
  32,
@@ -40,6 +40,7 @@ from liger_kernel.transformers import apply_liger_kernel_to_mistral
40
40
  from liger_kernel.transformers import apply_liger_kernel_to_mixtral
41
41
  from liger_kernel.transformers import apply_liger_kernel_to_mllama
42
42
  from liger_kernel.transformers import apply_liger_kernel_to_olmo2
43
+ from liger_kernel.transformers import apply_liger_kernel_to_olmo3
43
44
  from liger_kernel.transformers import apply_liger_kernel_to_phi3
44
45
  from liger_kernel.transformers import apply_liger_kernel_to_qwen2
45
46
  from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl
@@ -74,6 +75,7 @@ from test.utils import revert_liger_kernel_to_mistral
74
75
  from test.utils import revert_liger_kernel_to_mixtral
75
76
  from test.utils import revert_liger_kernel_to_mllama
76
77
  from test.utils import revert_liger_kernel_to_olmo2
78
+ from test.utils import revert_liger_kernel_to_olmo3
77
79
  from test.utils import revert_liger_kernel_to_phi3
78
80
  from test.utils import revert_liger_kernel_to_qwen2
79
81
  from test.utils import revert_liger_kernel_to_qwen2_5_vl
@@ -182,6 +184,15 @@ try:
182
184
  except ImportError:
183
185
  OLMO2_AVAILABLE = False
184
186
 
187
+ try:
188
+ # OLMO3 is only available in transformers>=4.57.0
189
+ from transformers.models.olmo3.configuration_olmo3 import Olmo3Config
190
+ from transformers.models.olmo3.modeling_olmo3 import Olmo3ForCausalLM
191
+
192
+ OLMO3_AVAILABLE = True
193
+ except ImportError:
194
+ OLMO3_AVAILABLE = False
195
+
185
196
  try:
186
197
  # Glm4 is only available in transformers>=4.51.3
187
198
  from transformers.models.glm4.configuration_glm4 import Glm4Config
@@ -945,6 +956,35 @@ if OLMO2_AVAILABLE:
945
956
  ),
946
957
  )
947
958
 
959
+ if OLMO3_AVAILABLE:
960
+ MINI_MODEL_SETUPS["mini_olmo3"] = MiniModelConfig(
961
+ liger_kernel_patch_func=apply_liger_kernel_to_olmo3,
962
+ liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo3,
963
+ model_class=Olmo3ForCausalLM,
964
+ mini_model_config=Olmo3Config(
965
+ bos_token_id=1, # 128000
966
+ eos_token_id=2, # 128001
967
+ pad_token_id=2,
968
+ cross_attention_layers=None,
969
+ dropout=0,
970
+ hidden_act="silu",
971
+ hidden_size=1024, # 4096
972
+ initializer_range=0.02,
973
+ intermediate_size=2048, # 14336
974
+ max_position_embeddings=4096,
975
+ num_attention_heads=8, # 32
976
+ num_hidden_layers=4, # 40
977
+ num_key_value_heads=2, # 8
978
+ rms_norm_eps=1e-5,
979
+ rope_scaling=None,
980
+ rope_theta=500_000,
981
+ tie_word_embeddings=False,
982
+ use_cache=True,
983
+ vocab_size=32000, # 128256,
984
+ attn_implementation="sdpa", # default value, pytorch native attention
985
+ ),
986
+ )
987
+
948
988
  if GLM4_AVAILABLE:
949
989
  MINI_MODEL_SETUPS["mini_glm4"] = MiniModelConfig(
950
990
  liger_kernel_patch_func=apply_liger_kernel_to_glm4,
@@ -1343,7 +1383,7 @@ if HUNYUAN_V1_AVAILABLE:
1343
1383
  liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe,
1344
1384
  liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe,
1345
1385
  model_class=HunYuanMoEV1ForCausalLM,
1346
- mini_model_config = HunYuanMoEV1Config(
1386
+ mini_model_config=HunYuanMoEV1Config(
1347
1387
  hidden_act="silu",
1348
1388
  attention_dropout=0.0,
1349
1389
  num_hidden_layers=4,
@@ -1649,6 +1689,22 @@ def run_mini_model(
1649
1689
  reason="OLMO2 not available in this version of transformers",
1650
1690
  ),
1651
1691
  ),
1692
+ pytest.param(
1693
+ "mini_olmo3",
1694
+ 32,
1695
+ 1e-4,
1696
+ torch.float32,
1697
+ 1e-8,
1698
+ 1e-5,
1699
+ 5e-3,
1700
+ 1e-5,
1701
+ 5e-3,
1702
+ 1e-5,
1703
+ marks=pytest.mark.skipif(
1704
+ not OLMO3_AVAILABLE,
1705
+ reason="OLMO3 not available in this version of transformers",
1706
+ ),
1707
+ ),
1652
1708
  pytest.param(
1653
1709
  "mini_glm4",
1654
1710
  32,