liger-kernel-nightly 0.5.10.dev20250624183504__tar.gz → 0.5.10.dev20250629005644__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (271)
  1. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/PKG-INFO +2 -1
  2. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/README.md +1 -0
  3. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/dev/modal/benchmarks.py +2 -1
  4. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/pyproject.toml +1 -1
  5. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/__init__.py +3 -0
  6. liger_kernel_nightly-0.5.10.dev20250629005644/src/liger_kernel/transformers/model/llama4.py +108 -0
  7. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/monkey_patch.py +88 -0
  8. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel_nightly.egg-info/PKG-INFO +2 -1
  9. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel_nightly.egg-info/SOURCES.txt +2 -0
  10. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/bf16/test_mini_models.py +50 -3
  11. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/bf16/test_mini_models_multimodal.py +131 -11
  12. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/bf16/test_mini_models_with_logits.py +48 -1
  13. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/fp32/test_mini_models.py +46 -1
  14. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/fp32/test_mini_models_multimodal.py +132 -10
  15. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/fp32/test_mini_models_with_logits.py +46 -1
  16. liger_kernel_nightly-0.5.10.dev20250629005644/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +98 -0
  17. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_monkey_patch.py +131 -0
  18. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/utils.py +23 -0
  19. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  20. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  21. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/pull_request_template.md +0 -0
  22. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/workflows/amd-ci.yml +0 -0
  23. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/workflows/benchmark.yml +0 -0
  24. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/workflows/docs.yml +0 -0
  25. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/workflows/intel-ci.yml +0 -0
  26. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/workflows/nvi-ci.yml +0 -0
  27. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/workflows/publish-nightly.yml +0 -0
  28. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.github/workflows/publish-release.yml +0 -0
  29. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.gitignore +0 -0
  30. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/.idea/workspace.xml +0 -0
  31. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/LICENSE +0 -0
  32. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/Makefile +0 -0
  33. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/NOTICE +0 -0
  34. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/README.md +0 -0
  35. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/__init__.py +0 -0
  36. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/benchmarks_visualizer.py +0 -0
  37. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/data/all_benchmark_data.csv +0 -0
  38. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/__init__.py +0 -0
  39. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  40. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  41. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  42. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  43. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_dyt.py +0 -0
  44. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_embedding.py +0 -0
  45. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  46. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  47. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  48. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_geglu.py +0 -0
  49. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_group_norm.py +0 -0
  50. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_jsd.py +0 -0
  51. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_kl_div.py +0 -0
  52. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  53. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  54. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  55. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  56. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  57. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  58. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_rope.py +0 -0
  59. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  60. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_softmax.py +0 -0
  61. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  62. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  63. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_swiglu.py +0 -0
  64. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/benchmark_tvd.py +0 -0
  65. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/benchmark/scripts/utils.py +0 -0
  66. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/dev/fmt-requirements.txt +0 -0
  67. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/dev/modal/tests.py +0 -0
  68. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/dev/modal/tests_bwd.py +0 -0
  69. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/Examples.md +0 -0
  70. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/Getting-Started.md +0 -0
  71. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/High-Level-APIs.md +0 -0
  72. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/Low-Level-APIs.md +0 -0
  73. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/acknowledgement.md +0 -0
  74. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/contributing.md +0 -0
  75. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/images/banner.GIF +0 -0
  76. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/images/compose.gif +0 -0
  77. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/images/e2e-memory.png +0 -0
  78. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/images/e2e-tps.png +0 -0
  79. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/images/logo-banner.png +0 -0
  80. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/images/patch.gif +0 -0
  81. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/images/post-training.png +0 -0
  82. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/index.md +0 -0
  83. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/docs/license.md +0 -0
  84. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/alignment/accelerate_config.yaml +0 -0
  85. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/alignment/run_orpo.py +0 -0
  86. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/README.md +0 -0
  87. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/callback.py +0 -0
  88. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/config/fsdp_config.json +0 -0
  89. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  90. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  91. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  92. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/img/llama_tps.png +0 -0
  93. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  94. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/img/qwen_tps.png +0 -0
  95. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/launch_on_modal.py +0 -0
  96. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/requirements.txt +0 -0
  97. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/run_benchmarks.sh +0 -0
  98. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/run_gemma.sh +0 -0
  99. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/run_llama.sh +0 -0
  100. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/run_qwen.sh +0 -0
  101. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/run_qwen2_vl.sh +0 -0
  102. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/training.py +0 -0
  103. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/huggingface/training_multimodal.py +0 -0
  104. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/lightning/README.md +0 -0
  105. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/lightning/requirements.txt +0 -0
  106. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/lightning/training.py +0 -0
  107. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/README.md +0 -0
  108. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/callback.py +0 -0
  109. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  110. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  111. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  112. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  113. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  114. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  115. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  116. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  117. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  118. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/medusa_util.py +0 -0
  119. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/requirements.txt +0 -0
  120. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  121. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/examples/medusa/train.py +0 -0
  122. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/licenses/LICENSE-Apache-2.0 +0 -0
  123. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  124. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  125. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/licenses/LICENSE-MIT-llmc +0 -0
  126. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/licenses/LICENSE-MIT-triton +0 -0
  127. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/mkdocs.yml +0 -0
  128. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/setup.cfg +0 -0
  129. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/setup.py +0 -0
  130. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/__init__.py +0 -0
  131. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/README.md +0 -0
  132. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  133. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  134. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  135. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/functional.py +0 -0
  136. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  137. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  138. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  139. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  140. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  141. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  142. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  143. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  144. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  145. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/env_report.py +0 -0
  146. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/__init__.py +0 -0
  147. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/cross_entropy.py +0 -0
  148. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/dyt.py +0 -0
  149. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  150. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  151. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  152. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  153. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  154. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/geglu.py +0 -0
  155. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/group_norm.py +0 -0
  156. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/grpo_loss.py +0 -0
  157. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/jsd.py +0 -0
  158. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/kl_div.py +0 -0
  159. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/layer_norm.py +0 -0
  160. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  161. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  162. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/rms_norm.py +0 -0
  163. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/rope.py +0 -0
  164. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/softmax.py +0 -0
  165. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/sparsemax.py +0 -0
  166. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/swiglu.py +0 -0
  167. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/tvd.py +0 -0
  168. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/ops/utils.py +0 -0
  169. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/auto_model.py +0 -0
  170. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  171. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/dyt.py +0 -0
  172. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  173. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/fsdp.py +0 -0
  174. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/functional.py +0 -0
  175. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  176. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  177. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  178. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/geglu.py +0 -0
  179. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/group_norm.py +0 -0
  180. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  181. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/jsd.py +0 -0
  182. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/kl_div.py +0 -0
  183. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/layer_norm.py +0 -0
  184. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/__init__.py +0 -0
  185. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/gemma.py +0 -0
  186. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  187. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/gemma3.py +0 -0
  188. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/glm4.py +0 -0
  189. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/llama.py +0 -0
  190. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/llava.py +0 -0
  191. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  192. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/mistral.py +0 -0
  193. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  194. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/mllama.py +0 -0
  195. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  196. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/paligemma.py +0 -0
  197. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/phi3.py +0 -0
  198. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  199. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  200. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  201. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  202. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  203. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  204. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  205. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/rms_norm.py +0 -0
  206. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/rope.py +0 -0
  207. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/softmax.py +0 -0
  208. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/sparsemax.py +0 -0
  209. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/swiglu.py +0 -0
  210. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  211. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  212. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  213. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/transformers/tvd.py +0 -0
  214. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/triton/__init__.py +0 -0
  215. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/triton/monkey_patch.py +0 -0
  216. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel/utils.py +0 -0
  217. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  218. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  219. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  220. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/__init__.py +0 -0
  221. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/chunked_loss/__init__.py +0 -0
  222. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/chunked_loss/test_cpo_loss.py +0 -0
  223. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/chunked_loss/test_dpo_loss.py +0 -0
  224. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/chunked_loss/test_grpo_loss.py +0 -0
  225. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/chunked_loss/test_jsd_loss.py +0 -0
  226. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/chunked_loss/test_kto_loss.py +0 -0
  227. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/chunked_loss/test_orpo_loss.py +0 -0
  228. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/chunked_loss/test_simpo_loss.py +0 -0
  229. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/conftest.py +0 -0
  230. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/__init__.py +0 -0
  231. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/bf16/__init__.py +0 -0
  232. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/convergence/fp32/__init__.py +0 -0
  233. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  234. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  235. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  236. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  237. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  238. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  239. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  240. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  241. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  242. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/tiny_shakespeare.txt +0 -0
  243. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  244. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  245. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  246. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_auto_model.py +0 -0
  247. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_cross_entropy.py +0 -0
  248. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_dyt.py +0 -0
  249. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_embedding.py +0 -0
  250. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_flex_attention.py +0 -0
  251. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  252. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_fused_linear_jsd.py +0 -0
  253. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  254. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_geglu.py +0 -0
  255. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_group_norm.py +0 -0
  256. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_grpo_loss.py +0 -0
  257. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_jsd.py +0 -0
  258. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_kl_div.py +0 -0
  259. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_layer_norm.py +0 -0
  260. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_mm_int8int2.py +0 -0
  261. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_multi_token_attention.py +0 -0
  262. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_qwen2vl_mrope.py +0 -0
  263. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_rms_norm.py +0 -0
  264. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_rope.py +0 -0
  265. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_softmax.py +0 -0
  266. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_sparsemax.py +0 -0
  267. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_swiglu.py +0 -0
  268. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_trainer_integration.py +0 -0
  269. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_transformers.py +0 -0
  270. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/transformers/test_tvd.py +0 -0
  271. {liger_kernel_nightly-0.5.10.dev20250624183504 → liger_kernel_nightly-0.5.10.dev20250629005644}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.5.10.dev20250624183504
3
+ Version: 0.5.10.dev20250629005644
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -290,6 +290,7 @@ loss.backward()
290
290
 
291
291
  | **Model** | **API** | **Supported Operations** |
292
292
  |-------------|--------------------------------------------------------------|-------------------------------------------------------------------------|
293
+ | Llama4 (Text) & (Multimodal) | `liger_kernel.transformers.apply_liger_kernel_to_llama4` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
293
294
  | LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
294
295
  | LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
295
296
  | Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -241,6 +241,7 @@ loss.backward()
241
241
 
242
242
  | **Model** | **API** | **Supported Operations** |
243
243
  |-------------|--------------------------------------------------------------|-------------------------------------------------------------------------|
244
+ | Llama4 (Text) & (Multimodal) | `liger_kernel.transformers.apply_liger_kernel_to_llama4` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
244
245
  | LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
245
246
  | LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
246
247
  | Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -14,7 +14,8 @@ app = modal.App("liger_benchmarks", image=image)
14
14
  repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
15
15
 
16
16
 
17
- @app.function(gpu=["100"], image=repo, timeout=60 * 90)
17
+ @app.function(gpu=["H100"], image=repo, timeout=60 * 90)
18
+
18
19
  def liger_benchmarks():
19
20
  import os
20
21
  import subprocess
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "liger_kernel_nightly"
7
- version = "0.5.10.dev20250624183504"
7
+ version = "0.5.10.dev20250629005644"
8
8
  description = "Efficient Triton kernels for LLM Training"
9
9
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -30,6 +30,7 @@ if TYPE_CHECKING:
30
30
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4 # noqa: F401
31
31
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_granite # noqa: F401
32
32
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama # noqa: F401
33
+ from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama4 # noqa: F401
33
34
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llava # noqa: F401
34
35
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mistral # noqa: F401
35
36
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mixtral # noqa: F401
@@ -87,6 +88,7 @@ def __getattr__(name: str):
87
88
  "apply_liger_kernel_to_granite",
88
89
  "apply_liger_kernel_to_llama",
89
90
  "apply_liger_kernel_to_llava",
91
+ "apply_liger_kernel_to_llama4",
90
92
  "apply_liger_kernel_to_mistral",
91
93
  "apply_liger_kernel_to_mixtral",
92
94
  "apply_liger_kernel_to_mllama",
@@ -141,6 +143,7 @@ if _TRANSFORMERS_AVAILABLE:
141
143
  "apply_liger_kernel_to_granite",
142
144
  "apply_liger_kernel_to_llama",
143
145
  "apply_liger_kernel_to_llava",
146
+ "apply_liger_kernel_to_llama4",
144
147
  "apply_liger_kernel_to_mistral",
145
148
  "apply_liger_kernel_to_mixtral",
146
149
  "apply_liger_kernel_to_mllama",
@@ -0,0 +1,108 @@
1
+ from typing import List
2
+ from typing import Optional
3
+ from typing import Tuple
4
+ from typing import Union
5
+
6
+ import torch
7
+
8
+ from transformers.cache_utils import Cache
9
+ from transformers.modeling_outputs import CausalLMOutputWithPast
10
+
11
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
12
+
13
+
14
def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Replacement `forward` for `Llama4ForCausalLM` that uses Liger's fused linear cross entropy.

    While training with `labels` (or `shift_labels`), the loss is computed by `LigerForCausalLMLoss`
    directly from the kept hidden states and `lm_head.weight`, and `logits` is returned as `None`.
    Otherwise the logits are materialized through `lm_head` and, if labels are available, the loss is
    computed with the model's regular `loss_function`.

    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0):
            If an `int`, only the last `logits_to_keep` positions of the hidden states are kept (`0` keeps all
            positions). Otherwise the value is used directly to index the sequence dimension.
        **kwargs:
            Forwarded to the underlying text model and to the loss function. An optional `shift_labels`
            entry is extracted and passed to the loss as pre-shifted labels.

    Returns:
        `CausalLMOutputWithPast` when `return_dict` resolves to `True` (explicitly or through
        `config.use_return_dict`), otherwise a plain tuple `(loss?, logits, *remaining_model_outputs)`.

    Example:

    ```python
    >>> from transformers import AutoTokenizer, Llama4ForCausalLM

    >>> # "<llama4-checkpoint>" is a placeholder for any Llama4 text checkpoint.
    >>> model = Llama4ForCausalLM.from_pretrained("<llama4-checkpoint>")
    >>> tokenizer = AutoTokenizer.from_pretrained("<llama4-checkpoint>")

    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")

    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # The inner model is always asked for a ModelOutput so the named fields below are available;
    # the caller's `return_dict` preference is honoured when building the final result.
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=True,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs[0]
    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    kept_hidden_states = hidden_states[:, slice_indices, :]

    # `shift_labels` must not reach the loss twice (once explicitly, once through **kwargs).
    shift_labels = kwargs.pop("shift_labels", None)
    has_labels = labels is not None or shift_labels is not None
    logits = None
    loss = None

    if self.training and has_labels:
        # Fused path: the loss is computed from hidden states and the lm_head weight.
        loss = LigerForCausalLMLoss(
            hidden_states=kept_hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            shift_labels=shift_labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )
    else:  # if in inference mode materialize logits
        logits = self.lm_head(kept_hidden_states)
        if has_labels:
            # Pass `shift_labels` through so pre-shifted labels are not silently dropped in eval mode.
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                shift_labels=shift_labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    if not return_dict:
        # Tuple layout follows the usual HF convention: optional loss, logits, then remaining model outputs.
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
@@ -363,6 +363,92 @@ def apply_liger_kernel_to_llava(
363
363
  logger.warning(f"{vision_model_name} is not supported by Liger kernel.")
364
364
 
365
365
 
366
def apply_liger_kernel_to_llama4(
    rope: bool = False,
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = True,
    rms_norm: bool = True,
    swiglu: bool = True,
    model: PreTrainedModel = None,
    layer_norm: bool = True,
) -> None:
    """
    Apply Liger kernels to replace original implementation in HuggingFace Llama4 models.

    Args:
        rope (bool): Whether to apply Liger's rotary position embedding. Default is False.
            Liger's rotary embedding is not available for Llama4, so passing True raises.
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is True.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
            loaded. Default is None.
        layer_norm (bool): Whether to apply Liger's LayerNorm to the vision tower of an already-loaded
            `model`. Default is True. Has no effect when `model` is None or has no vision tower.

    Raises:
        AssertionError: If both `cross_entropy` and `fused_linear_cross_entropy` are True.
        NotImplementedError: If `rope` is True.
        ValueError: If `model` is given but is not a supported Llama4 model class.
    """
    assert not (cross_entropy and fused_linear_cross_entropy), (
        "cross_entropy and fused_linear_cross_entropy cannot both be True."
    )

    from transformers.models.llama4 import modeling_llama4
    from transformers.models.llama4.modeling_llama4 import Llama4ForCausalLM
    from transformers.models.llama4.modeling_llama4 import Llama4ForConditionalGeneration
    from transformers.models.llama4.modeling_llama4 import Llama4TextModel
    from transformers.models.llama4.modeling_llama4 import Llama4VisionModel

    from liger_kernel.transformers.model.llama4 import lce_forward as llama4_lce_forward

    # Class-level patches: affect models instantiated after this call.
    if rope:
        raise NotImplementedError("liger_rotary_pos_emb is not available for Llama4 models.")
    if rms_norm:
        modeling_llama4.Llama4TextRMSNorm = LigerRMSNorm
    if swiglu:
        modeling_llama4.Llama4TextMLP = LigerSwiGLUMLP

    if cross_entropy:
        # NOTE(review): this only has an effect if modeling_llama4 looks up `CrossEntropyLoss`
        # from its own module namespace - confirm against the installed transformers version.
        modeling_llama4.CrossEntropyLoss = LigerCrossEntropyLoss

    if fused_linear_cross_entropy:
        modeling_llama4.Llama4ForCausalLM.forward = llama4_lce_forward

    if model is None:
        return

    # The model instance already exists, so we need to additionally patch the
    # instance variables that reference already-instantiated modules
    if isinstance(model, Llama4ForConditionalGeneration):
        language_model: Llama4ForCausalLM = model.language_model
        vision_model: Llama4VisionModel = model.vision_model
        text_model: Llama4TextModel = language_model.model
    elif isinstance(model, Llama4ForCausalLM):
        text_model = model.model
        vision_model = None
    elif isinstance(model, Llama4TextModel):
        text_model = model
        vision_model = None
    else:
        raise ValueError(f"Unsupported Llama4 model type: {type(model)}")

    if text_model is not None:
        if rms_norm:
            _patch_rms_norm_module(text_model.norm)
        for decoder_layer in text_model.layers:
            if swiglu:
                feed_forward = decoder_layer.feed_forward
                # NOTE(review): in HF Llama4, MoE decoder layers wrap the dense MLP as
                # `feed_forward.shared_expert`; patch that submodule when present instead of
                # the MoE wrapper (which has no gate/up/down projections) - confirm.
                swiglu_target = getattr(feed_forward, "shared_expert", feed_forward)
                _patch_swiglu_module(swiglu_target, LigerSwiGLUMLP)
            if rms_norm:
                _patch_rms_norm_module(decoder_layer.input_layernorm)
                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)

    # Honour `layer_norm` for every vision LayerNorm, including the pre/post norms.
    if vision_model is not None and layer_norm:
        _patch_layer_norm_module(vision_model.layernorm_pre)
        _patch_layer_norm_module(vision_model.layernorm_post)

        for layer in vision_model.model.layers:
            _patch_layer_norm_module(layer.input_layernorm)
            _patch_layer_norm_module(layer.post_attention_layernorm)
451
+
366
452
  def apply_liger_kernel_to_mllama(
367
453
  rope: bool = True,
368
454
  cross_entropy: bool = False,
@@ -1605,6 +1691,8 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
1605
1691
  "gemma3": apply_liger_kernel_to_gemma3,
1606
1692
  "glm4": apply_liger_kernel_to_glm4,
1607
1693
  "llama": apply_liger_kernel_to_llama,
1694
+ "llama4_text": apply_liger_kernel_to_llama4,
1695
+ "llama4": apply_liger_kernel_to_llama4,
1608
1696
  "llava": apply_liger_kernel_to_llava,
1609
1697
  "granite": apply_liger_kernel_to_granite,
1610
1698
  "mllama": apply_liger_kernel_to_mllama,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.5.10.dev20250624183504
3
+ Version: 0.5.10.dev20250629005644
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -290,6 +290,7 @@ loss.backward()
290
290
 
291
291
  | **Model** | **API** | **Supported Operations** |
292
292
  |-------------|--------------------------------------------------------------|-------------------------------------------------------------------------|
293
+ | Llama4 (Text) & (Multimodal) | `liger_kernel.transformers.apply_liger_kernel_to_llama4` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
293
294
  | LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
294
295
  | LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
295
296
  | Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -183,6 +183,7 @@ src/liger_kernel/transformers/model/gemma2.py
183
183
  src/liger_kernel/transformers/model/gemma3.py
184
184
  src/liger_kernel/transformers/model/glm4.py
185
185
  src/liger_kernel/transformers/model/llama.py
186
+ src/liger_kernel/transformers/model/llama4.py
186
187
  src/liger_kernel/transformers/model/llava.py
187
188
  src/liger_kernel/transformers/model/loss_utils.py
188
189
  src/liger_kernel/transformers/model/mistral.py
@@ -234,6 +235,7 @@ test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json
234
235
  test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json
235
236
  test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json
236
237
  test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json
238
+ test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json
237
239
  test/resources/scripts/generate_tokenized_dataset.py
238
240
  test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow
239
241
  test/resources/tiny_shakespeare_tokenized/dataset_info.json
@@ -9,6 +9,8 @@ from transformers.models.gemma2 import Gemma2Config
9
9
  from transformers.models.gemma2 import Gemma2ForCausalLM
10
10
  from transformers.models.llama import LlamaConfig
11
11
  from transformers.models.llama import LlamaForCausalLM
12
+ from transformers.models.llama4 import Llama4ForCausalLM
13
+ from transformers.models.llama4.configuration_llama4 import Llama4TextConfig
12
14
  from transformers.models.mistral import MistralConfig
13
15
  from transformers.models.mistral import MistralForCausalLM
14
16
  from transformers.models.mixtral import MixtralConfig
@@ -24,6 +26,7 @@ from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text
24
26
  from liger_kernel.transformers import apply_liger_kernel_to_glm4
25
27
  from liger_kernel.transformers import apply_liger_kernel_to_granite
26
28
  from liger_kernel.transformers import apply_liger_kernel_to_llama
29
+ from liger_kernel.transformers import apply_liger_kernel_to_llama4
27
30
  from liger_kernel.transformers import apply_liger_kernel_to_llava
28
31
  from liger_kernel.transformers import apply_liger_kernel_to_mistral
29
32
  from liger_kernel.transformers import apply_liger_kernel_to_mixtral
@@ -46,6 +49,7 @@ from test.utils import revert_liger_kernel_to_gemma3_text
46
49
  from test.utils import revert_liger_kernel_to_glm4
47
50
  from test.utils import revert_liger_kernel_to_granite
48
51
  from test.utils import revert_liger_kernel_to_llama
52
+ from test.utils import revert_liger_kernel_to_llama4
49
53
  from test.utils import revert_liger_kernel_to_llava
50
54
  from test.utils import revert_liger_kernel_to_mistral
51
55
  from test.utils import revert_liger_kernel_to_mixtral
@@ -152,6 +156,35 @@ from liger_kernel.utils import infer_device
152
156
  device = infer_device()
153
157
 
154
158
  MINI_MODEL_SETUPS = {
159
+ "mini_llama4": MiniModelConfig(
160
+ liger_kernel_patch_func=apply_liger_kernel_to_llama4,
161
+ liger_kernel_patch_revert_func=revert_liger_kernel_to_llama4,
162
+ model_class=Llama4ForCausalLM,
163
+ mini_model_config=Llama4TextConfig(
164
+ bos_token_id=1, # None
165
+ eos_token_id=2, # 151329, 151336, 151338
166
+ pad_token_id=2, # 151329
167
+ partial_rotary_factor=1.0,
168
+ cross_attention_layers=None,
169
+ dropout=0,
170
+ hidden_act="silu",
171
+ hidden_size=1024, # 6144
172
+ initializer_range=0.02,
173
+ intermediate_size=2048, # 14336
174
+ max_position_embeddings=4096, # 32768
175
+ num_attention_heads=8, # 48
176
+ num_hidden_layers=4, # 61
177
+ num_key_value_heads=2,
178
+ rms_norm_eps=1e-5,
179
+ rope_scaling=None,
180
+ rope_theta=10000.0,
181
+ tie_word_embeddings=False,
182
+ use_cache=True,
183
+ vocab_size=32000, # 151552
184
+ attention_bias=True,
185
+ attn_implementation="sdpa", # default value, pytorch native attention
186
+ ),
187
+ ),
155
188
  "mini_llama3": MiniModelConfig(
156
189
  liger_kernel_patch_func=apply_liger_kernel_to_llama,
157
190
  liger_kernel_patch_revert_func=revert_liger_kernel_to_llama,
@@ -380,6 +413,7 @@ MINI_MODEL_SETUPS = {
380
413
  ),
381
414
  }
382
415
 
416
+
383
417
  if QWEN3_AVAILABLE:
384
418
  MINI_MODEL_SETUPS["mini_qwen3"] = MiniModelConfig(
385
419
  liger_kernel_patch_func=apply_liger_kernel_to_qwen3,
@@ -770,7 +804,7 @@ if GLM4_AVAILABLE:
770
804
  )
771
805
 
772
806
 
773
- def create_model(model_name="mini_llama3"):
807
+ def create_model(model_name="mini_llama4"):
774
808
  """
775
809
  Create a mini version model
776
810
  The commented values are the original values
@@ -781,7 +815,7 @@ def create_model(model_name="mini_llama3"):
781
815
 
782
816
 
783
817
  def run_mini_model(
784
- model_name="mini_llama3",
818
+ model_name="mini_llama4",
785
819
  num_steps=100,
786
820
  dtype=torch.bfloat16,
787
821
  lr=1e-5,
@@ -804,7 +838,7 @@ def run_mini_model(
804
838
  "rms_norm": True,
805
839
  }
806
840
 
807
- if "glm4" in model_name:
841
+ if "glm4" in model_name or "llama4" in model_name:
808
842
  kwargs["rope"] = False
809
843
 
810
844
  model_supports_layer_norm = "qwen2_vl" in model_name
@@ -865,6 +899,19 @@ def run_mini_model(
865
899
  @pytest.mark.parametrize(
866
900
  "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
867
901
  [
902
+ pytest.param(
903
+ "mini_llama4",
904
+ 32,
905
+ 1e-4,
906
+ torch.bfloat16,
907
+ 1e-3,
908
+ 1e-2,
909
+ 1e-1,
910
+ 1e-1,
911
+ 1e-2,
912
+ 1e-2,
913
+ marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
914
+ ),
868
915
  pytest.param(
869
916
  "mini_llama3",
870
917
  32,
@@ -11,6 +11,7 @@ from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
11
11
  from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
12
12
 
13
13
  from liger_kernel.transformers import apply_liger_kernel_to_gemma3
14
+ from liger_kernel.transformers import apply_liger_kernel_to_llama4
14
15
  from liger_kernel.transformers import apply_liger_kernel_to_llava
15
16
  from liger_kernel.transformers import apply_liger_kernel_to_mllama
16
17
  from liger_kernel.transformers import apply_liger_kernel_to_paligemma
@@ -28,6 +29,7 @@ from test.utils import load_processor_config
28
29
  from test.utils import load_tokenizer_config
29
30
  from test.utils import multimodal_collate_fn
30
31
  from test.utils import revert_liger_kernel_to_gemma3
32
+ from test.utils import revert_liger_kernel_to_llama4
31
33
  from test.utils import revert_liger_kernel_to_llava
32
34
  from test.utils import revert_liger_kernel_to_mllama
33
35
  from test.utils import revert_liger_kernel_to_Paligemma
@@ -126,6 +128,19 @@ try:
126
128
  except ImportError:
127
129
  GEMMA3_AVAILABLE = False
128
130
 
131
+ try:
132
+ from transformers.models.llama4.configuration_llama4 import Llama4Config
133
+ from transformers.models.llama4.configuration_llama4 import Llama4TextConfig
134
+ from transformers.models.llama4.configuration_llama4 import Llama4VisionConfig
135
+ from transformers.models.llama4.image_processing_llama4_fast import Llama4ImageProcessorFast
136
+ from transformers.models.llama4.modeling_llama4 import Llama4ForConditionalGeneration
137
+ from transformers.models.llama4.processing_llama4 import Llama4Processor
138
+
139
+ LLAMA4_AVAILABLE = True
140
+
141
+ except ImportError:
142
+ LLAMA4_AVAILABLE = False
143
+
129
144
  from liger_kernel.utils import infer_device
130
145
 
131
146
  device = infer_device()
@@ -144,6 +159,55 @@ TEST_IMAGE_DIM = 64
144
159
 
145
160
  MINI_MODEL_SETUPS = {}
146
161
 
162
+ if LLAMA4_AVAILABLE:
163
+ MINI_MODEL_SETUPS["mini_llama4"] = MiniModelConfig(
164
+ liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_llama4, fused_linear_cross_entropy=False),
165
+ liger_kernel_patch_revert_func=revert_liger_kernel_to_llama4,
166
+ model_class=Llama4ForConditionalGeneration,
167
+ mini_model_config=Llama4Config(
168
+ image_token_index=8,
169
+ vision_config=Llama4VisionConfig(
170
+ attn_implementation_autoset=True,
171
+ attention_dropout=0.0,
172
+ hidden_act="gelu",
173
+ hidden_size=512, # 1280
174
+ image_size=560, # 560
175
+ initializer_range=0.02,
176
+ intermediate_layers_indices=[2], # [3, 7, 15, etc...]
177
+ intermediate_size=2048, # 5120
178
+ max_num_tiles=1, # 4
179
+ norm_eps=1e-5,
180
+ num_attention_heads=4, # 16
181
+ num_channels=3,
182
+ num_global_layers=2, # 8
183
+ num_hidden_layers=8, # 32
184
+ patch_size=280, # 14
185
+ supported_aspect_ratios=[[1, 1]], # [[1, 1], [1, 2], etc... ]
186
+ vision_output_dim=4096, # 7680
187
+ ),
188
+ text_config=Llama4TextConfig(
189
+ bos_token_id=0,
190
+ eos_token_id=0,
191
+ pad_token_id=0,
192
+ cross_attention_layers=[2], # [3, 8, 13, 18, etc...]
193
+ dropout=0,
194
+ hidden_act="silu",
195
+ hidden_size=1024, # 4096
196
+ initializer_range=0.02,
197
+ intermediate_size=2048, # 14336
198
+ max_position_embeddings=131_072,
199
+ num_attention_heads=8, # 32
200
+ num_hidden_layers=4, # 40
201
+ num_key_value_heads=2, # 8
202
+ rms_norm_eps=1e-5,
203
+ rope_theta=500_000,
204
+ tie_word_embeddings=False,
205
+ use_cache=True,
206
+ vocab_size=32000, # 128256,
207
+ ),
208
+ attn_implementation="sdpa",
209
+ ),
210
+ )
147
211
 
148
212
  if MLLAMA_AVAILABLE:
149
213
  MINI_MODEL_SETUPS["mini_mllama"] = MiniModelConfig(
@@ -578,7 +642,30 @@ def create_processor(model_name: str):
578
642
  image_processor = CLIPImageProcessor(**image_processor_config)
579
643
 
580
644
  return LlavaProcessor(**processor_config, image_processor=image_processor, tokenizer=fast_tokenizer)
581
-
645
+ elif model_name.startswith("mini_llama4"):
646
+ tokenizer_config = load_tokenizer_config(
647
+ os.path.join(
648
+ FAKE_CONFIGS_PATH,
649
+ "meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json",
650
+ )
651
+ )
652
+ tokenizer_base = train_bpe_tokenizer(
653
+ [
654
+ token.content
655
+ for key, token in sorted(
656
+ tokenizer_config["added_tokens_decoder"].items(),
657
+ key=lambda x: int(x[0]),
658
+ )
659
+ ]
660
+ )
661
+ fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
662
+ image_processor = Llama4ImageProcessorFast(size={"height": 560, "width": 560})
663
+ return Llama4Processor(
664
+ image_processor=image_processor,
665
+ tokenizer=fast_tokenizer,
666
+ fake_image_token="<|image|>",
667
+ image_token="<|image|>",
668
+ )
582
669
  elif model_name == "mini_mllama":
583
670
  tokenizer_config = load_tokenizer_config(
584
671
  os.path.join(
@@ -678,14 +765,27 @@ def create_multimodal_dataset(model_name: str):
678
765
 
679
766
  def preprocess_function(examples):
680
767
  """Tokenize text, preprocess images, and generate other relevant inputs for the model."""
681
- return processor(
682
- text=examples["text"],
683
- images=examples["image"],
684
- padding="max_length",
685
- truncation=True,
686
- max_length=1024, # longer than for text-only b/c images require quite a few tokens
687
- return_tensors="pt",
688
- )
768
+ if model_name == "mini_llama4":
769
+ # Process images and text separately to avoid complex token replacement, this helped setting lower tolerance than processing them together.
770
+ image_inputs = processor.image_processor(images=examples["image"], return_tensors="pt")
771
+ text_inputs = processor.tokenizer(
772
+ examples["text"],
773
+ padding="max_length",
774
+ truncation=True,
775
+ max_length=1024,
776
+ return_tensors="pt",
777
+ )
778
+ return {**text_inputs, **image_inputs}
779
+ else:
780
+ # For other models, use the normal processor
781
+ return processor(
782
+ text=examples["text"],
783
+ images=examples["image"],
784
+ padding="max_length",
785
+ truncation=True,
786
+ max_length=1024, # longer than for text-only b/c images require quite a few tokens
787
+ return_tensors="pt",
788
+ )
689
789
 
690
790
  train_dataset = (
691
791
  load_dataset("text", data_files={"train": UNTOKENIZED_DATASET_PATH}, split="train")
@@ -722,7 +822,7 @@ def run_mini_model_multimodal(
722
822
  set_seed(42)
723
823
 
724
824
  revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]}
725
- if "mllama" in model_name:
825
+ if "mllama" in model_name or "llama4" in model_name:
726
826
  revert_kwargs["model_type"] = "conditional_generation"
727
827
 
728
828
  if with_liger is True:
@@ -731,7 +831,8 @@ def run_mini_model_multimodal(
731
831
  "rms_norm": True,
732
832
  "cross_entropy": False,
733
833
  }
734
-
834
+ if "llama4" in model_name:
835
+ kwargs["rope"] = False
735
836
  if "qwen2_5_vl" not in model_name and "llava" not in model_name:
736
837
  kwargs["layer_norm"] = True
737
838
 
@@ -856,6 +957,25 @@ def run_mini_model_multimodal(
856
957
  ),
857
958
  ],
858
959
  ),
960
+ pytest.param(
961
+ "mini_llama4",
962
+ 32,
963
+ 1e-4,
964
+ torch.bfloat16,
965
+ 1e-1,
966
+ 1e-1,
967
+ 0.2,
968
+ 0.3,
969
+ 1e-2,
970
+ 1e-2,
971
+ marks=[
972
+ pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
973
+ pytest.mark.skipif(
974
+ not LLAMA4_AVAILABLE,
975
+ reason="Llama4 not available in this version of transformers",
976
+ ),
977
+ ],
978
+ ),
859
979
  pytest.param(
860
980
  "mini_paligemma",
861
981
  32,