liger-kernel-nightly 0.5.5.dev20250318183047__tar.gz → 0.5.5.dev20250320214749__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of liger-kernel-nightly might be problematic.

Files changed (232)
  1. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/PKG-INFO +1 -1
  2. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/pyproject.toml +1 -1
  3. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/paligemma.py +184 -0
  4. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/monkey_patch.py +39 -13
  5. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
  6. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/bf16/test_mini_models_multimodal.py +75 -3
  7. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/fp32/test_mini_models_multimodal.py +74 -3
  8. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/utils.py +3 -1
  9. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  10. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  11. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/pull_request_template.md +0 -0
  12. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/amd-ci.yml +0 -0
  13. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/docs.yml +0 -0
  14. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/intel-ci.yml +0 -0
  15. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/nvi-ci.yml +0 -0
  16. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/publish-nightly.yml +0 -0
  17. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/publish-release.yml +0 -0
  18. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/.gitignore +0 -0
  19. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/LICENSE +0 -0
  20. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/Makefile +0 -0
  21. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/NOTICE +0 -0
  22. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/README.md +0 -0
  23. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/README.md +0 -0
  24. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/__init__.py +0 -0
  25. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/benchmarks_visualizer.py +0 -0
  26. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/data/all_benchmark_data.csv +0 -0
  27. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/__init__.py +0 -0
  28. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  29. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  30. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  31. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  32. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_embedding.py +0 -0
  33. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  34. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  35. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_geglu.py +0 -0
  36. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_group_norm.py +0 -0
  37. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_jsd.py +0 -0
  38. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_kl_div.py +0 -0
  39. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  40. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  41. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  42. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  43. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  44. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_rope.py +0 -0
  45. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  46. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_swiglu.py +0 -0
  47. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_tvd.py +0 -0
  48. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/utils.py +0 -0
  49. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/dev/fmt-requirements.txt +0 -0
  50. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/dev/modal/tests.py +0 -0
  51. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/dev/modal/tests_bwd.py +0 -0
  52. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/Examples.md +0 -0
  53. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/Getting-Started.md +0 -0
  54. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/High-Level-APIs.md +0 -0
  55. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/Low-Level-APIs.md +0 -0
  56. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/acknowledgement.md +0 -0
  57. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/contributing.md +0 -0
  58. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/banner.GIF +0 -0
  59. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/compose.gif +0 -0
  60. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/e2e-memory.png +0 -0
  61. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/e2e-tps.png +0 -0
  62. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/logo-banner.png +0 -0
  63. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/patch.gif +0 -0
  64. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/post-training.png +0 -0
  65. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/index.md +0 -0
  66. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/license.md +0 -0
  67. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/alignment/accelerate_config.yaml +0 -0
  68. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/alignment/run_orpo.py +0 -0
  69. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/README.md +0 -0
  70. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/callback.py +0 -0
  71. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/config/fsdp_config.json +0 -0
  72. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  73. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  74. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  75. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/llama_tps.png +0 -0
  76. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  77. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/qwen_tps.png +0 -0
  78. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/launch_on_modal.py +0 -0
  79. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/requirements.txt +0 -0
  80. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_benchmarks.sh +0 -0
  81. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_gemma.sh +0 -0
  82. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_llama.sh +0 -0
  83. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_qwen.sh +0 -0
  84. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_qwen2_vl.sh +0 -0
  85. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/training.py +0 -0
  86. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/training_multimodal.py +0 -0
  87. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/lightning/README.md +0 -0
  88. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/lightning/requirements.txt +0 -0
  89. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/lightning/training.py +0 -0
  90. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/README.md +0 -0
  91. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/callback.py +0 -0
  92. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  93. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  94. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  95. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  96. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  97. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  98. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  99. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  100. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  101. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/medusa_util.py +0 -0
  102. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/requirements.txt +0 -0
  103. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  104. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/train.py +0 -0
  105. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-Apache-2.0 +0 -0
  106. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  107. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  108. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-MIT-llmc +0 -0
  109. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-MIT-triton +0 -0
  110. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/mkdocs.yml +0 -0
  111. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/setup.cfg +0 -0
  112. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/setup.py +0 -0
  113. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/__init__.py +0 -0
  114. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/README.md +0 -0
  115. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  116. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  117. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  118. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/functional.py +0 -0
  119. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  120. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  121. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/fused_linear_rlhf.py +0 -0
  122. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  123. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  124. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  125. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  126. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  127. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  128. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/env_report.py +0 -0
  129. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/__init__.py +0 -0
  130. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/cross_entropy.py +0 -0
  131. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  132. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  133. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  134. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  135. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/geglu.py +0 -0
  136. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/group_norm.py +0 -0
  137. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/jsd.py +0 -0
  138. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/kl_div.py +0 -0
  139. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/layer_norm.py +0 -0
  140. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  141. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/rms_norm.py +0 -0
  142. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/rope.py +0 -0
  143. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/swiglu.py +0 -0
  144. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/tvd.py +0 -0
  145. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/utils.py +0 -0
  146. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/__init__.py +0 -0
  147. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/auto_model.py +0 -0
  148. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  149. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  150. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/functional.py +0 -0
  151. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  152. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  153. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/geglu.py +0 -0
  154. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/group_norm.py +0 -0
  155. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/jsd.py +0 -0
  156. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/kl_div.py +0 -0
  157. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/layer_norm.py +0 -0
  158. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/__init__.py +0 -0
  159. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/gemma.py +0 -0
  160. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  161. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/llama.py +0 -0
  162. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  163. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/mistral.py +0 -0
  164. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  165. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/mllama.py +0 -0
  166. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  167. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/phi3.py +0 -0
  168. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  169. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  170. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  171. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  172. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/rms_norm.py +0 -0
  173. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/rope.py +0 -0
  174. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/swiglu.py +0 -0
  175. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  176. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  177. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  178. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/tvd.py +0 -0
  179. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/triton/__init__.py +0 -0
  180. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/triton/monkey_patch.py +0 -0
  181. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/utils.py +0 -0
  182. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
  183. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  184. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  185. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  186. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/__init__.py +0 -0
  187. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/__init__.py +0 -0
  188. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_cpo_loss.py +0 -0
  189. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_dpo_loss.py +0 -0
  190. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_grpo_loss.py +0 -0
  191. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_jsd_loss.py +0 -0
  192. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_kto_loss.py +0 -0
  193. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_orpo_loss.py +0 -0
  194. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_simpo_loss.py +0 -0
  195. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/conftest.py +0 -0
  196. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/__init__.py +0 -0
  197. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/bf16/__init__.py +0 -0
  198. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/bf16/test_mini_models.py +0 -0
  199. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
  200. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/fp32/__init__.py +0 -0
  201. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/fp32/test_mini_models.py +0 -0
  202. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
  203. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  204. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  205. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  206. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  207. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  208. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/tiny_shakespeare.txt +0 -0
  209. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  210. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  211. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  212. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_auto_model.py +0 -0
  213. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_cross_entropy.py +0 -0
  214. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_embedding.py +0 -0
  215. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_flex_attention.py +0 -0
  216. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  217. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_fused_linear_jsd.py +0 -0
  218. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_geglu.py +0 -0
  219. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_group_norm.py +0 -0
  220. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_jsd.py +0 -0
  221. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_kl_div.py +0 -0
  222. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_layer_norm.py +0 -0
  223. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_mm_int8int2.py +0 -0
  224. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_monkey_patch.py +0 -0
  225. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_qwen2vl_mrope.py +0 -0
  226. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_rms_norm.py +0 -0
  227. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_rope.py +0 -0
  228. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_swiglu.py +0 -0
  229. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_trainer_integration.py +0 -0
  230. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_transformers.py +0 -0
  231. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_tvd.py +0 -0
  232. {liger_kernel_nightly-0.5.5.dev20250318183047 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.5.dev20250318183047
+Version: 0.5.5.dev20250320214749
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "liger_kernel_nightly"
-version = "0.5.5.dev20250318183047"
+version = "0.5.5.dev20250320214749"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -21,6 +21,190 @@ from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinea
 logger = logging.get_logger(__name__)


+@add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
+@replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+def lce_forward_deprecated(
+    self,
+    input_ids: torch.LongTensor = None,
+    pixel_values: torch.FloatTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+    token_type_ids: Optional[torch.LongTensor] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+) -> Union[Tuple, PaliGemmaCausalLMOutputWithPast]:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+
+    >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/PaliGemma-test-224px-hf")
+    >>> processor = AutoProcessor.from_pretrained("google/PaliGemma-test-224px-hf")
+
+    >>> prompt = "answer en Where is the cow standing?"
+    >>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(**inputs, max_length=30)
+    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "answer en Where is the cow standing?\nbeach"
+    ```"""
+
+    if (input_ids is None) ^ (inputs_embeds is not None):
+        raise ValueError(
+            "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+        )
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # the attention mask is turned 4d after, we keep track of the original one
+    input_attention_mask = attention_mask
+
+    if inputs_embeds is None:
+        # 1. Extra the input embeddings
+        inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        # 2. Merge text and images
+        if pixel_values is not None and input_ids.shape[1] != 1:
+            image_outputs = self.vision_tower(pixel_values.to(inputs_embeds.dtype))
+            selected_image_feature = image_outputs.last_hidden_state
+            image_features = self.multi_modal_projector(selected_image_feature)
+
+            if cache_position is None:
+                cache_position = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device)
+            inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
+                image_features, inputs_embeds, input_ids, attention_mask, labels, token_type_ids, cache_position
+            )
+
+        else:
+            # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+            # generation with cache
+            if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                # Retrieve the first layer to inspect the logits and mask out the hidden states
+                # that are set to 0
+                # TODO @molbap this will only work for dynamic cache.
+                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+
+                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+
+                # Get the target length
+                target_seqlen = cache_position[-1] + 1
+                extended_attention_mask = torch.ones(
+                    (attention_mask.shape[0], target_seqlen - attention_mask.shape[1] + 1),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+                # Filter out only the tokens that can be un-attended, this can happen
+                # if one uses PaliGemma+ Fused modules where the cache on the
+                # first iteration is already big enough, or if one passes custom cache
+                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                new_batch_index = batch_index[valid_indices]
+                new_non_attended_tokens = non_attended_tokens[valid_indices]
+
+                # Zero-out the places where we don't need to attend
+                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
+
+                attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+
+    attention_mask = attention_mask.to(inputs_embeds.dtype)
+    outputs = self.language_model.model(
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        cache_position=cache_position,
+    )
+
+    hidden_states = outputs[0]
+
+    loss = None
+    logits = None
+
+    if self.training and (labels is not None):
+        shift_hidden_states = hidden_states[..., :-1, :]
+        shift_labels = labels[..., 1:]
+
+        hidden_device = shift_hidden_states.device
+
+        if attention_mask is not None:
+            # we use the input attention mask to shift the hidden_states and labels, because it is 2D.
+            # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+            shift_attention_mask = attention_mask[:, -shift_hidden_states.shape[1] :].to(hidden_device)
+            shift_hidden_states = shift_hidden_states[shift_attention_mask.to(hidden_device) != 0].contiguous()
+            shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+        else:
+            shift_hidden_states = shift_hidden_states.contiguous()
+            shift_labels = shift_labels.contiguous()
+
+        # Flatten hidden state
+        shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size)
+        shift_labels = shift_labels.view(-1).to(hidden_device)
+
+        lce = LigerFusedLinearCrossEntropyLoss()
+        loss = lce(self.language_model.lm_head.weight, shift_hidden_states, shift_labels)
+
+    else:
+        logits = self.language_model.lm_head(hidden_states)
+        if labels is not None:
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if input_attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                shift_attention_mask = input_attention_mask[..., 1:]
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+
+            flat_logits = shift_logits.view(-1, self.config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        return (loss,) + output if loss is not None else output
+
+    return PaliGemmaCausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+    )
+
+
 @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
 @add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
 @replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
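For reference, the training branch of `lce_forward_deprecated` above hands the `lm_head` weight and the shifted hidden states directly to `LigerFusedLinearCrossEntropyLoss`, so the full `(tokens, vocab_size)` logits tensor is never materialized. A minimal standalone sketch of the two paths, not part of the packaged files, with made-up sizes and assuming a CUDA device (the Liger kernels are Triton-based):

```python
import torch
from torch.nn import CrossEntropyLoss

from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss

# Illustrative sizes only.
hidden_size, vocab_size, num_tokens = 1024, 32000, 512

lm_head_weight = torch.randn(vocab_size, hidden_size, device="cuda", dtype=torch.bfloat16)
hidden_states = torch.randn(num_tokens, hidden_size, device="cuda", dtype=torch.bfloat16, requires_grad=True)
labels = torch.randint(0, vocab_size, (num_tokens,), device="cuda")

# Non-training branch above: project to logits, then apply cross entropy.
logits = hidden_states @ lm_head_weight.T  # the (num_tokens, vocab_size) tensor is materialized
loss_unfused = CrossEntropyLoss()(logits.float(), labels)

# Training branch above: fused linear + cross entropy; the projection is computed
# chunk-wise inside the kernel, so the full logits tensor is never stored.
lce = LigerFusedLinearCrossEntropyLoss()
loss_fused = lce(lm_head_weight, hidden_states, labels)
loss_fused.backward()
```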
@@ -631,6 +631,7 @@ def apply_liger_kernel_to_paligemma(

     # PaliGemma submodules are ['vision_tower', 'multi_modal_projector', 'language_model']

+    from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
     from transformers.models.gemma2.modeling_gemma2 import Gemma2ForCausalLM
     from transformers.models.paligemma import modeling_paligemma
     from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
@@ -639,6 +640,7 @@ def apply_liger_kernel_to_paligemma(
     from transformers.models.siglip.modeling_siglip import SiglipVisionModel

     from liger_kernel.transformers.model.paligemma import lce_forward
+    from liger_kernel.transformers.model.paligemma import lce_forward_deprecated

     # The vision_tower is a SiglipVisionModel
     if layer_norm:
@@ -647,13 +649,22 @@ def apply_liger_kernel_to_paligemma(
     # SiglipMLP is standard FFN so LigerGEGLUMLP is not compatible
     # The multi_modal_projector is Linear, nothing to do

-    # The language_model is Gemma2ForCausalLM
-    apply_liger_kernel_to_gemma2(rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, geglu=geglu)
+    # The language_model is GemmaForCausalLM or Gemma2ForCausalLM
+    apply_liger_kernel_to_gemma(
+        rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu
+    )
+    apply_liger_kernel_to_gemma2(
+        rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu
+    )
     # Handle loss function
     if cross_entropy:
         modeling_paligemma.nn.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
-        modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
+        if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
+            modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
+        else:  # if version < 4.46.1
+            logger.warning(TRANSFORMER_DEPRECATION_WARNING)
+            modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward_deprecated

     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -672,16 +683,31 @@ def apply_liger_kernel_to_paligemma(
                 _patch_layer_norm_module(layer.layer_norm1)
                 _patch_layer_norm_module(layer.layer_norm2)

-        language_model: Gemma2ForCausalLM = model.language_model
-
-        apply_liger_kernel_to_gemma2(
-            rope=rope,
-            cross_entropy=False,
-            fused_linear_cross_entropy=False,
-            rms_norm=rms_norm,
-            geglu=geglu,
-            model=language_model,
-        )
+        language_model = model.language_model
+
+        if isinstance(language_model, GemmaForCausalLM):
+            apply_liger_kernel_to_gemma(
+                rope=rope,
+                cross_entropy=False,
+                fused_linear_cross_entropy=False,
+                rms_norm=rms_norm,
+                geglu=geglu,
+                model=language_model,
+            )
+
+        elif isinstance(language_model, Gemma2ForCausalLM):
+            apply_liger_kernel_to_gemma2(
+                rope=rope,
+                cross_entropy=False,
+                fused_linear_cross_entropy=False,
+                rms_norm=rms_norm,
+                geglu=geglu,
+                model=language_model,
+            )
+        else:
+            raise TypeError(
+                "The language_model of a PaliGemma model must be either GemmaForCausalLM or Gemma2ForCausalLM."
+            )


 def apply_liger_kernel_to_qwen2(
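For reference, a hedged usage sketch, not part of the packaged files: after this change `apply_liger_kernel_to_paligemma` covers both Gemma and Gemma2 language backbones and falls back to `lce_forward_deprecated` on transformers releases older than 4.46.1. The checkpoint id below is illustrative only.

```python
import torch
from transformers import PaliGemmaForConditionalGeneration

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_paligemma

# Patch the transformers modeling code before the model is constructed.
apply_liger_kernel_to_paligemma(
    rope=True,
    cross_entropy=False,
    fused_linear_cross_entropy=True,  # lce_forward, or lce_forward_deprecated on transformers < 4.46.1
    rms_norm=True,
    geglu=True,
)

model = PaliGemmaForConditionalGeneration.from_pretrained(
    "google/paligemma-3b-pt-224",  # illustrative checkpoint id
    torch_dtype=torch.bfloat16,
)

# Alternatively, patch an already-instantiated model; its language_model may be
# GemmaForCausalLM or Gemma2ForCausalLM, and both are handled after this change.
apply_liger_kernel_to_paligemma(model=model)
```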
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.5.dev20250318183047
+Version: 0.5.5.dev20250320214749
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -64,6 +64,10 @@ except ImportError:
     MLLAMA_AVAILABLE = False

 try:
+    import transformers
+
+    from packaging import version
+    from transformers.models.gemma.configuration_gemma import GemmaConfig
     from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
     from transformers.models.gemma2.configuration_gemma2 import Gemma2Config
     from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig
@@ -72,7 +76,7 @@ try:
     from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
     from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor

-    PALIGEMMA_AVAILABLE = True
+    PALIGEMMA_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.46.0")
 except ImportError:
     PALIGEMMA_AVAILABLE = False

@@ -152,6 +156,55 @@ if MLLAMA_AVAILABLE:

 if PALIGEMMA_AVAILABLE:
     MINI_MODEL_SETUPS["mini_paligemma"] = MiniModelConfig(
+        liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False),
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma,
+        model_class=PaliGemmaForConditionalGeneration,
+        mini_model_config=PaliGemmaConfig(
+            vision_config=SiglipVisionConfig(
+                attention_dropout=0.0,
+                hidden_act="gelu_pytorch_tanh",
+                hidden_size=1152,
+                image_size=224,
+                intermediate_size=2048,  # 4304
+                layer_norm_eps=1e-06,
+                num_attention_heads=4,  # 16
+                num_channels=3,
+                num_hidden_layers=4,  # 27
+                num_image_tokens=256,
+                num_positions=256,
+                patch_size=14,
+                projection_dim=1024,  # 2304
+            ),
+            text_config=GemmaConfig(
+                vocab_size=32000,  # 256000
+                hidden_size=1024,  # 3072
+                intermediate_size=2048,  # 24576
+                num_hidden_layers=4,  # 28
+                num_attention_heads=4,  # 16
+                num_key_value_heads=4,  # 16
+                head_dim=256,
+                hidden_activation="gelu_pytorch_tanh",
+                max_position_embeddings=8192,
+                initializer_range=0.02,
+                rms_norm_eps=1e-06,
+                use_cache=True,
+                pad_token_id=0,
+                # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
+                # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+                bos_token_id=1,  # 128000
+                eos_token_id=2,  # 128001
+                tie_word_embeddings=True,
+                rope_theta=10000.0,
+                attention_bias=False,
+                attention_dropout=0.0,
+            ),
+            image_token_index=4,  # NOTE: outside the vocab size
+            attn_implementation="eager",
+            vocab_size=32000,
+            projection_dim=1024,
+        ),
+    )
+    MINI_MODEL_SETUPS["mini_paligemma2"] = MiniModelConfig(
         liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False),
         liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma,
         model_class=PaliGemmaForConditionalGeneration,
@@ -297,7 +350,7 @@ if QWEN2_5_VL_AVAILABLE:
     )


-def create_processor(model_name):
+def create_processor(model_name: str):
     if model_name == "mini_qwen2_vl":
         tokenizer_config = load_tokenizer_config(
             os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json")
@@ -352,7 +405,7 @@ def create_processor(model_name):
         image_processor = MllamaImageProcessor(size={"height": 560, "width": 560})
         return MllamaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)

-    elif model_name == "mini_paligemma":
+    elif model_name.startswith("mini_paligemma"):
         tokenizer_config = load_tokenizer_config(
             os.path.join(
                 FAKE_CONFIGS_PATH,
@@ -580,6 +633,25 @@ def run_mini_model_multimodal(
             ),
         ],
     ),
+        pytest.param(
+            "mini_paligemma2",
+            32,
+            1e-4,
+            torch.bfloat16,
+            1e-3,
+            1e-2,
+            1e-1,
+            1e-2,
+            1e-2,
+            1e-2,
+            marks=[
+                pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+                pytest.mark.skipif(
+                    not PALIGEMMA_AVAILABLE,
+                    reason="Paligemma2 not available in this version of transformers",
+                ),
+            ],
+        ),
     ],
 )
 def test_mini_model_multimodal(
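For reference, both convergence test files now gate `PALIGEMMA_AVAILABLE` on the installed transformers version rather than on the import alone. The same pattern as a self-contained sketch, not part of the packaged files (the 4.46.0 threshold is the one used in the diff):

```python
import transformers
from packaging import version

try:
    # The imports confirm that the config classes exist at all ...
    from transformers.models.gemma.configuration_gemma import GemmaConfig  # noqa: F401
    from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig  # noqa: F401

    # ... and the version check additionally requires transformers >= 4.46.0
    # before the PaliGemma mini-model tests are enabled.
    PALIGEMMA_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.46.0")
except ImportError:
    PALIGEMMA_AVAILABLE = False

print(f"PaliGemma convergence tests enabled: {PALIGEMMA_AVAILABLE}")
```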
@@ -63,6 +63,10 @@ except ImportError:
     MLLAMA_AVAILABLE = False

 try:
+    import transformers
+
+    from packaging import version
+    from transformers.models.gemma.configuration_gemma import GemmaConfig
     from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
     from transformers.models.gemma2.configuration_gemma2 import Gemma2Config
     from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig
@@ -71,7 +75,7 @@ try:
     from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
     from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor

-    PALIGEMMA_AVAILABLE = True
+    PALIGEMMA_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.46.0")
 except ImportError:
     PALIGEMMA_AVAILABLE = False

@@ -151,6 +155,56 @@ if MLLAMA_AVAILABLE:

 if PALIGEMMA_AVAILABLE:
     MINI_MODEL_SETUPS["mini_paligemma"] = MiniModelConfig(
+        liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False),
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma,
+        model_class=PaliGemmaForConditionalGeneration,
+        mini_model_config=PaliGemmaConfig(
+            vision_config=SiglipVisionConfig(
+                attention_dropout=0.0,
+                hidden_act="gelu_pytorch_tanh",
+                hidden_size=1152,
+                image_size=224,
+                intermediate_size=2048,  # 4304
+                layer_norm_eps=1e-06,
+                num_attention_heads=4,  # 16
+                num_channels=3,
+                num_hidden_layers=4,  # 27
+                num_image_tokens=256,
+                num_positions=256,
+                patch_size=14,
+                projection_dim=1024,  # 2304
+            ),
+            text_config=GemmaConfig(
+                vocab_size=32000,  # 256000
+                hidden_size=1024,  # 3072
+                intermediate_size=2048,  # 24576
+                num_hidden_layers=4,  # 28
+                num_attention_heads=4,  # 16
+                num_key_value_heads=4,  # 16
+                head_dim=256,
+                hidden_activation="gelu_pytorch_tanh",
+                max_position_embeddings=8192,
+                initializer_range=0.02,
+                rms_norm_eps=1e-06,
+                use_cache=True,
+                pad_token_id=0,
+                # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
+                # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+                bos_token_id=1,  # 128000
+                eos_token_id=2,  # 128001
+                tie_word_embeddings=True,
+                rope_theta=10000.0,
+                attention_bias=False,
+                attention_dropout=0.0,
+            ),
+            image_token_index=4,  # NOTE: outside the vocab size
+            attn_implementation="eager",
+            vocab_size=32000,
+            projection_dim=1024,
+        ),
+    )
+
+    MINI_MODEL_SETUPS["mini_paligemma2"] = MiniModelConfig(
         liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False),
         liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma,
         model_class=PaliGemmaForConditionalGeneration,
@@ -200,6 +254,7 @@ if PALIGEMMA_AVAILABLE:
         ),
     )

+
 if QWEN2_VL_AVAILABLE:
     MINI_MODEL_SETUPS["mini_qwen2_vl"] = MiniModelConfig(
         liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_qwen2_vl, fused_linear_cross_entropy=False),
@@ -295,7 +350,7 @@ if QWEN2_5_VL_AVAILABLE:
     )


-def create_processor(model_name):
+def create_processor(model_name: str):
     if model_name == "mini_qwen2_vl":
         tokenizer_config = load_tokenizer_config(
             os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json")
@@ -350,7 +405,7 @@ def create_processor(model_name):
         image_processor = MllamaImageProcessor(size={"height": 560, "width": 560})
         return MllamaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)

-    elif model_name == "mini_paligemma":
+    elif model_name.startswith("mini_paligemma"):
         tokenizer_config = load_tokenizer_config(
             os.path.join(
                 FAKE_CONFIGS_PATH,
@@ -569,6 +624,22 @@ def run_mini_model_multimodal(
                 reason="Paligemma not available in this version of transformers",
             ),
         ),
+        pytest.param(
+            "mini_paligemma2",
+            32,
+            1e-4,
+            torch.float32,
+            1e-8,
+            1e-5,
+            5e-3,
+            1e-5,
+            5e-3,
+            1e-5,
+            marks=pytest.mark.skipif(
+                not PALIGEMMA_AVAILABLE,
+                reason="Paligemma2 not available in this version of transformers",
+            ),
+        ),
     ],
 )
 def test_mini_model_multimodal(
@@ -314,13 +314,15 @@ def revert_liger_kernel_to_Paligemma(model_config: MiniModelConfig):
     Revert all Liger kernel patches applied to Gemma2.
     """

+    from transformers.models.gemma import modeling_gemma
     from transformers.models.gemma2 import modeling_gemma2
     from transformers.models.paligemma import modeling_paligemma
     from transformers.models.siglip import modeling_siglip

-    importlib.reload(modeling_siglip)
+    importlib.reload(modeling_gemma)
     importlib.reload(modeling_gemma2)
     importlib.reload(modeling_paligemma)
+    importlib.reload(modeling_siglip)
     model_config.model_class = modeling_paligemma.PaliGemmaForConditionalGeneration
     print("Liger kernel patches have been reverted.")
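For reference, `revert_liger_kernel_to_Paligemma` undoes the patches by reloading the affected transformers modules, and now reloads `modeling_gemma` as well because the patch can touch the Gemma backbone. A generic sketch of the reload-to-revert idea, not part of the packaged files, using a fake patch for illustration:

```python
import importlib

from transformers.models.gemma import modeling_gemma

# Simulate a monkey patch on the module's class (as a Liger patch function would do).
modeling_gemma.GemmaForCausalLM.forward = lambda self, *args, **kwargs: None

# Reloading re-executes the module from source, rebuilding GemmaForCausalLM with its
# original forward. Code that cached the old class object is not updated, which is why
# the test helper also re-binds model_config.model_class after reloading.
importlib.reload(modeling_gemma)
print(modeling_gemma.GemmaForCausalLM.forward.__name__)  # "forward", not "<lambda>"
```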