liger-kernel-nightly 0.5.10.dev20250613192702__tar.gz → 0.5.10.dev20250613212111__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (269)
  1. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/PKG-INFO +1 -1
  2. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/dev/modal/benchmarks.py +8 -8
  3. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/pyproject.toml +1 -1
  4. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
  5. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/bf16/test_mini_models.py +20 -18
  6. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/bf16/test_mini_models_multimodal.py +16 -10
  7. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/bf16/test_mini_models_with_logits.py +18 -11
  8. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/fp32/test_mini_models.py +14 -12
  9. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/fp32/test_mini_models_multimodal.py +19 -12
  10. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/fp32/test_mini_models_with_logits.py +15 -9
  11. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/utils.py +11 -0
  12. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  13. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  14. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/pull_request_template.md +0 -0
  15. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/workflows/amd-ci.yml +0 -0
  16. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/workflows/benchmark.yml +0 -0
  17. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/workflows/docs.yml +0 -0
  18. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/workflows/intel-ci.yml +0 -0
  19. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/workflows/nvi-ci.yml +0 -0
  20. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/workflows/publish-nightly.yml +0 -0
  21. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.github/workflows/publish-release.yml +0 -0
  22. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.gitignore +0 -0
  23. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/.idea/workspace.xml +0 -0
  24. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/LICENSE +0 -0
  25. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/Makefile +0 -0
  26. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/NOTICE +0 -0
  27. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/README.md +0 -0
  28. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/README.md +0 -0
  29. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/__init__.py +0 -0
  30. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/benchmarks_visualizer.py +0 -0
  31. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/data/all_benchmark_data.csv +0 -0
  32. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/__init__.py +0 -0
  33. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  34. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  35. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  36. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  37. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_dyt.py +0 -0
  38. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_embedding.py +0 -0
  39. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  40. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  41. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  42. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_geglu.py +0 -0
  43. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_group_norm.py +0 -0
  44. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_jsd.py +0 -0
  45. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_kl_div.py +0 -0
  46. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  47. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  48. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  49. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  50. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  51. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  52. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_rope.py +0 -0
  53. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  54. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_softmax.py +0 -0
  55. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  56. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  57. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_swiglu.py +0 -0
  58. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/benchmark_tvd.py +0 -0
  59. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/benchmark/scripts/utils.py +0 -0
  60. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/dev/fmt-requirements.txt +0 -0
  61. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/dev/modal/tests.py +0 -0
  62. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/dev/modal/tests_bwd.py +0 -0
  63. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/Examples.md +0 -0
  64. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/Getting-Started.md +0 -0
  65. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/High-Level-APIs.md +0 -0
  66. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/Low-Level-APIs.md +0 -0
  67. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/acknowledgement.md +0 -0
  68. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/contributing.md +0 -0
  69. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/images/banner.GIF +0 -0
  70. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/images/compose.gif +0 -0
  71. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/images/e2e-memory.png +0 -0
  72. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/images/e2e-tps.png +0 -0
  73. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/images/logo-banner.png +0 -0
  74. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/images/patch.gif +0 -0
  75. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/images/post-training.png +0 -0
  76. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/index.md +0 -0
  77. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/docs/license.md +0 -0
  78. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/alignment/accelerate_config.yaml +0 -0
  79. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/alignment/run_orpo.py +0 -0
  80. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/README.md +0 -0
  81. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/callback.py +0 -0
  82. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/config/fsdp_config.json +0 -0
  83. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  84. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  85. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  86. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/img/llama_tps.png +0 -0
  87. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  88. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/img/qwen_tps.png +0 -0
  89. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/launch_on_modal.py +0 -0
  90. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/requirements.txt +0 -0
  91. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/run_benchmarks.sh +0 -0
  92. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/run_gemma.sh +0 -0
  93. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/run_llama.sh +0 -0
  94. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/run_qwen.sh +0 -0
  95. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/run_qwen2_vl.sh +0 -0
  96. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/training.py +0 -0
  97. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/huggingface/training_multimodal.py +0 -0
  98. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/lightning/README.md +0 -0
  99. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/lightning/requirements.txt +0 -0
  100. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/lightning/training.py +0 -0
  101. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/README.md +0 -0
  102. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/callback.py +0 -0
  103. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  104. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  105. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  106. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  107. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  108. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  109. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  110. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  111. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  112. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/medusa_util.py +0 -0
  113. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/requirements.txt +0 -0
  114. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  115. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/examples/medusa/train.py +0 -0
  116. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/licenses/LICENSE-Apache-2.0 +0 -0
  117. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  118. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  119. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/licenses/LICENSE-MIT-llmc +0 -0
  120. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/licenses/LICENSE-MIT-triton +0 -0
  121. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/mkdocs.yml +0 -0
  122. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/setup.cfg +0 -0
  123. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/setup.py +0 -0
  124. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/__init__.py +0 -0
  125. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/README.md +0 -0
  126. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  127. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  128. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  129. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/functional.py +0 -0
  130. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
  131. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  132. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  133. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  134. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  135. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
  136. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  137. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  138. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  139. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/env_report.py +0 -0
  140. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/__init__.py +0 -0
  141. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/cross_entropy.py +0 -0
  142. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/dyt.py +0 -0
  143. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  144. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  145. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  146. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  147. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  148. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/geglu.py +0 -0
  149. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/group_norm.py +0 -0
  150. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/grpo_loss.py +0 -0
  151. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/jsd.py +0 -0
  152. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/kl_div.py +0 -0
  153. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/layer_norm.py +0 -0
  154. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  155. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  156. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/rms_norm.py +0 -0
  157. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/rope.py +0 -0
  158. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/softmax.py +0 -0
  159. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/sparsemax.py +0 -0
  160. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/swiglu.py +0 -0
  161. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/tvd.py +0 -0
  162. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/ops/utils.py +0 -0
  163. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/__init__.py +0 -0
  164. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/auto_model.py +0 -0
  165. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  166. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/dyt.py +0 -0
  167. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  168. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/fsdp.py +0 -0
  169. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/functional.py +0 -0
  170. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  171. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  172. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  173. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/geglu.py +0 -0
  174. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/group_norm.py +0 -0
  175. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  176. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/jsd.py +0 -0
  177. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/kl_div.py +0 -0
  178. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/layer_norm.py +0 -0
  179. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/__init__.py +0 -0
  180. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/gemma.py +0 -0
  181. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  182. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/gemma3.py +0 -0
  183. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/glm4.py +0 -0
  184. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/llama.py +0 -0
  185. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/llava.py +0 -0
  186. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
  187. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/mistral.py +0 -0
  188. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  189. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/mllama.py +0 -0
  190. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/olmo2.py +0 -0
  191. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/paligemma.py +0 -0
  192. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/phi3.py +0 -0
  193. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  194. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
  195. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  196. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/qwen3.py +0 -0
  197. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
  198. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/monkey_patch.py +0 -0
  199. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  200. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  201. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/rms_norm.py +0 -0
  202. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/rope.py +0 -0
  203. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/softmax.py +0 -0
  204. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/sparsemax.py +0 -0
  205. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/swiglu.py +0 -0
  206. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  207. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  208. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  209. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/transformers/tvd.py +0 -0
  210. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/triton/__init__.py +0 -0
  211. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/triton/monkey_patch.py +0 -0
  212. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel/utils.py +0 -0
  213. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
  214. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  215. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  216. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  217. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/__init__.py +0 -0
  218. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/chunked_loss/__init__.py +0 -0
  219. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/chunked_loss/test_cpo_loss.py +0 -0
  220. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/chunked_loss/test_dpo_loss.py +0 -0
  221. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/chunked_loss/test_grpo_loss.py +0 -0
  222. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/chunked_loss/test_jsd_loss.py +0 -0
  223. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/chunked_loss/test_kto_loss.py +0 -0
  224. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/chunked_loss/test_orpo_loss.py +0 -0
  225. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/chunked_loss/test_simpo_loss.py +0 -0
  226. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/conftest.py +0 -0
  227. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/__init__.py +0 -0
  228. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/bf16/__init__.py +0 -0
  229. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/convergence/fp32/__init__.py +0 -0
  230. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  231. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  232. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  233. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  234. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  235. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  236. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  237. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  238. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  239. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/tiny_shakespeare.txt +0 -0
  240. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  241. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  242. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  243. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_auto_model.py +0 -0
  244. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_cross_entropy.py +0 -0
  245. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_dyt.py +0 -0
  246. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_embedding.py +0 -0
  247. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_flex_attention.py +0 -0
  248. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
  249. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_fused_linear_jsd.py +0 -0
  250. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  251. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_geglu.py +0 -0
  252. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_group_norm.py +0 -0
  253. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_grpo_loss.py +0 -0
  254. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_jsd.py +0 -0
  255. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_kl_div.py +0 -0
  256. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_layer_norm.py +0 -0
  257. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_mm_int8int2.py +0 -0
  258. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_monkey_patch.py +0 -0
  259. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_multi_token_attention.py +0 -0
  260. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_qwen2vl_mrope.py +0 -0
  261. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_rms_norm.py +0 -0
  262. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_rope.py +0 -0
  263. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_softmax.py +0 -0
  264. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_sparsemax.py +0 -0
  265. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_swiglu.py +0 -0
  266. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_trainer_integration.py +0 -0
  267. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_transformers.py +0 -0
  268. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/transformers/test_tvd.py +0 -0
  269. {liger_kernel_nightly-0.5.10.dev20250613192702 → liger_kernel_nightly-0.5.10.dev20250613212111}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.5.10.dev20250613192702
3
+ Version: 0.5.10.dev20250613212111
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -16,8 +16,8 @@ repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
16
16
 
17
17
  @app.function(gpu="H100", image=repo, timeout=60 * 45)
18
18
  def liger_benchmarks():
19
- import subprocess
20
19
  import os
20
+ import subprocess
21
21
 
22
22
  subprocess.run(
23
23
  ["uv pip install -e '.[dev]' --system"],
@@ -30,7 +30,7 @@ def liger_benchmarks():
30
30
  file_path = Path(REMOTE_ROOT_PATH) / "benchmark" / "data" / "all_benchmark_data.csv"
31
31
  print(f"Checking if file exists at: {file_path}")
32
32
  print(f"File exists: {os.path.exists(file_path)}")
33
-
33
+
34
34
  if not os.path.exists(file_path):
35
35
  print("Listing directory contents:")
36
36
  data_dir = file_path.parent
@@ -53,21 +53,21 @@ def main():
53
53
  # Run the benchmarks and get the data
54
54
  print("Starting benchmark run...")
55
55
  benchmark_data = liger_benchmarks.remote()
56
-
56
+
57
57
  if not benchmark_data:
58
58
  raise ValueError("No data received from remote function")
59
-
59
+
60
60
  # Save the data locally
61
61
  local_data_path = ROOT_PATH / "benchmark" / "data" / "all_benchmark_data.csv"
62
62
  print(f"Attempting to save data to: {local_data_path}")
63
-
63
+
64
64
  local_data_path.parent.mkdir(parents=True, exist_ok=True)
65
-
65
+
66
66
  with open(local_data_path, "wb") as f:
67
67
  f.write(benchmark_data)
68
-
68
+
69
69
  print(f"Successfully saved {len(benchmark_data)} bytes to: {local_data_path}")
70
-
70
+
71
71
  except Exception as e:
72
72
  print(f"Error occurred: {str(e)}")
73
73
  raise
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "liger_kernel_nightly"
7
- version = "0.5.10.dev20250613192702"
7
+ version = "0.5.10.dev20250613212111"
8
8
  description = "Efficient Triton kernels for LLM Training"
9
9
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.5.10.dev20250613192702
3
+ Version: 0.5.10.dev20250613212111
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -38,6 +38,8 @@ from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe
38
38
  from test.utils import DEFAULT_DATASET_PATH
39
39
  from test.utils import MiniModelConfig
40
40
  from test.utils import assert_verbose_allclose
41
+ from test.utils import get_logprobs
42
+ from test.utils import get_topk
41
43
  from test.utils import revert_liger_kernel_to_gemma
42
44
  from test.utils import revert_liger_kernel_to_gemma2
43
45
  from test.utils import revert_liger_kernel_to_gemma3_text
@@ -851,17 +853,17 @@ def run_mini_model(
851
853
  eval_output = model(**eval_batch)
852
854
  print(f"Eval Loss: {eval_output.loss.item()}")
853
855
  loss_list.append(eval_output.loss.item())
854
-
856
+ topk_logprobs = get_topk(get_logprobs(eval_output.logits))
855
857
  MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
856
858
  return {
857
859
  "loss": loss_list,
858
- "logits": eval_output.logits,
860
+ "topk_logprobs": topk_logprobs.values,
859
861
  "model": model,
860
862
  }
861
863
 
862
864
 
863
865
  @pytest.mark.parametrize(
864
- "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
866
+ "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
865
867
  [
866
868
  pytest.param(
867
869
  "mini_llama3",
@@ -884,7 +886,7 @@ def run_mini_model(
884
886
  1e-3,
885
887
  1e-2,
886
888
  1e-1,
887
- 1e-2,
889
+ 1e-1,
888
890
  1e-2,
889
891
  1e-2,
890
892
  marks=[
@@ -902,7 +904,7 @@ def run_mini_model(
902
904
  torch.bfloat16,
903
905
  1e-3,
904
906
  1e-2,
905
- 1, # 1e-1
907
+ 1e-1, # 1e-1
906
908
  1e-1, # 1e-2
907
909
  1e-2,
908
910
  1e-2,
@@ -972,7 +974,7 @@ def run_mini_model(
972
974
  torch.bfloat16,
973
975
  1e-3,
974
976
  1e-2,
975
- 1, # 1e-1
977
+ 1e-1, # 1e-1
976
978
  1e-1, # 1e-2
977
979
  1e-2,
978
980
  1e-2,
@@ -1111,8 +1113,8 @@ def run_mini_model(
1111
1113
  torch.bfloat16,
1112
1114
  1e-3,
1113
1115
  1e-2,
1114
- 1e-1,
1115
1116
  1e-2,
1117
+ 1e-1,
1116
1118
  1e-2,
1117
1119
  1e-2,
1118
1120
  marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1124,8 +1126,8 @@ def run_mini_model(
1124
1126
  torch.bfloat16,
1125
1127
  1e-3,
1126
1128
  1e-2,
1127
- 1e-1,
1128
1129
  1e-2,
1130
+ 1e-1,
1129
1131
  1e-2,
1130
1132
  1e-2,
1131
1133
  marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1153,8 +1155,8 @@ def run_mini_model(
1153
1155
  torch.bfloat16,
1154
1156
  1e-3,
1155
1157
  1e-2,
1156
- 1e-1,
1157
- 1e-2,
1158
+ 3e-1,
1159
+ 4e-1,
1158
1160
  1e-2,
1159
1161
  1e-2,
1160
1162
  marks=[
@@ -1174,8 +1176,8 @@ def test_mini_model(
1174
1176
  dtype,
1175
1177
  loss_atol,
1176
1178
  loss_rtol,
1177
- logits_atol,
1178
- logits_rtol,
1179
+ logprobs_atol,
1180
+ logprobs_rtol,
1179
1181
  param_atol,
1180
1182
  param_rtol,
1181
1183
  ):
@@ -1193,13 +1195,13 @@ def test_mini_model(
1193
1195
  rtol=loss_rtol,
1194
1196
  )
1195
1197
 
1196
- # Compare the logits from evaluation step
1197
- if expected_output["logits"] is not None and actual_output["logits"] is not None:
1198
+ # Compare the topk logprobs from evaluation step
1199
+ if expected_output["topk_logprobs"] is not None and actual_output["topk_logprobs"] is not None:
1198
1200
  assert_verbose_allclose(
1199
- expected_output["logits"],
1200
- actual_output["logits"],
1201
- atol=logits_atol,
1202
- rtol=logits_rtol,
1201
+ expected_output["topk_logprobs"],
1202
+ actual_output["topk_logprobs"],
1203
+ atol=logprobs_atol,
1204
+ rtol=logprobs_rtol,
1203
1205
  )
1204
1206
 
1205
1207
  # Compare the params from the last step
@@ -20,6 +20,8 @@ from test.utils import FAKE_CONFIGS_PATH
20
20
  from test.utils import UNTOKENIZED_DATASET_PATH
21
21
  from test.utils import MiniModelConfig
22
22
  from test.utils import assert_verbose_allclose
23
+ from test.utils import get_logprobs
24
+ from test.utils import get_topk
23
25
  from test.utils import is_torchvision_available
24
26
  from test.utils import load_image_processing_config
25
27
  from test.utils import load_processor_config
@@ -764,13 +766,17 @@ def run_mini_model_multimodal(
764
766
 
765
767
  print(f"Step {i}, Loss: {output.loss.item()}")
766
768
  loss_list.append(output.loss.item())
767
-
769
+ topk_logprobs = get_topk(get_logprobs(output.logits))
768
770
  MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
769
- return {"loss": loss_list, "logits": output.logits, "model": model}
771
+ return {
772
+ "loss": loss_list,
773
+ "topk_logprobs": topk_logprobs.values,
774
+ "model": model,
775
+ }
770
776
 
771
777
 
772
778
  @pytest.mark.parametrize(
773
- "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
779
+ "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
774
780
  [
775
781
  pytest.param(
776
782
  "mini_qwen2_vl",
@@ -917,8 +923,8 @@ def test_mini_model_multimodal(
917
923
  dtype,
918
924
  loss_atol,
919
925
  loss_rtol,
920
- logits_atol,
921
- logits_rtol,
926
+ logprobs_atol,
927
+ logprobs_rtol,
922
928
  param_atol,
923
929
  param_rtol,
924
930
  ):
@@ -937,12 +943,12 @@ def test_mini_model_multimodal(
937
943
  rtol=loss_rtol,
938
944
  )
939
945
 
940
- # Compare the logits from the last step
946
+ # Compare the topk logprobs from evaluation step
941
947
  assert_verbose_allclose(
942
- expected_output["logits"],
943
- actual_output["logits"],
944
- atol=logits_atol,
945
- rtol=logits_rtol,
948
+ expected_output["topk_logprobs"],
949
+ actual_output["topk_logprobs"],
950
+ atol=logprobs_atol,
951
+ rtol=logprobs_rtol,
946
952
  )
947
953
 
948
954
  # Compare the params from the last step
@@ -38,6 +38,8 @@ from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe
38
38
  from test.utils import DEFAULT_DATASET_PATH
39
39
  from test.utils import MiniModelConfig
40
40
  from test.utils import assert_verbose_allclose
41
+ from test.utils import get_logprobs
42
+ from test.utils import get_topk
41
43
  from test.utils import revert_liger_kernel_to_gemma
42
44
  from test.utils import revert_liger_kernel_to_gemma2
43
45
  from test.utils import revert_liger_kernel_to_gemma3_text
@@ -842,12 +844,17 @@ def run_mini_model(
842
844
  print(f"Step {i}, Loss: {output.loss.item()}")
843
845
  loss_list.append(output.loss.item())
844
846
 
847
+ topk_logprobs = get_topk(get_logprobs(output.logits))
845
848
  MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
846
- return {"loss": loss_list, "logits": output.logits, "model": model}
849
+ return {
850
+ "loss": loss_list,
851
+ "topk_logprobs": topk_logprobs.values,
852
+ "model": model,
853
+ }
847
854
 
848
855
 
849
856
  @pytest.mark.parametrize(
850
- "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
857
+ "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
851
858
  [
852
859
  pytest.param(
853
860
  "mini_llama3",
@@ -1058,8 +1065,8 @@ def run_mini_model(
1058
1065
  torch.bfloat16,
1059
1066
  1e-3,
1060
1067
  1e-2,
1061
- 1e-1,
1062
1068
  1e-2,
1069
+ 1e-1,
1063
1070
  1e-2,
1064
1071
  1e-2,
1065
1072
  marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1071,8 +1078,8 @@ def run_mini_model(
1071
1078
  torch.bfloat16,
1072
1079
  1e-3,
1073
1080
  1e-2,
1074
- 1e-1,
1075
1081
  1e-2,
1082
+ 1e-1,
1076
1083
  1e-2,
1077
1084
  1e-2,
1078
1085
  marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
@@ -1159,8 +1166,8 @@ def test_mini_model(
1159
1166
  dtype,
1160
1167
  loss_atol,
1161
1168
  loss_rtol,
1162
- logits_atol,
1163
- logits_rtol,
1169
+ logprobs_atol,
1170
+ logprobs_rtol,
1164
1171
  param_atol,
1165
1172
  param_rtol,
1166
1173
  ):
@@ -1180,12 +1187,12 @@ def test_mini_model(
1180
1187
 
1181
1188
  # No logits are materialized
1182
1189
  # import pdb; pdb.set_trace()
1183
- # Compare the logits from the last step
1190
+ # Compare the topk logprobs from evaluation step
1184
1191
  assert_verbose_allclose(
1185
- expected_output["logits"],
1186
- actual_output["logits"],
1187
- atol=logits_atol,
1188
- rtol=logits_rtol,
1192
+ expected_output["topk_logprobs"],
1193
+ actual_output["topk_logprobs"],
1194
+ atol=logprobs_atol,
1195
+ rtol=logprobs_rtol,
1189
1196
  )
1190
1197
 
1191
1198
  # Compare the params from the last step
@@ -38,6 +38,8 @@ from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe
38
38
  from test.utils import DEFAULT_DATASET_PATH
39
39
  from test.utils import MiniModelConfig
40
40
  from test.utils import assert_verbose_allclose
41
+ from test.utils import get_logprobs
42
+ from test.utils import get_topk
41
43
  from test.utils import revert_liger_kernel_to_gemma
42
44
  from test.utils import revert_liger_kernel_to_gemma2
43
45
  from test.utils import revert_liger_kernel_to_gemma3_text
@@ -849,17 +851,17 @@ def run_mini_model(
849
851
  eval_output = model(**eval_batch)
850
852
  print(f"Eval Loss: {eval_output.loss.item()}")
851
853
  loss_list.append(eval_output.loss.item())
852
-
854
+ topk_logprobs = get_topk(get_logprobs(eval_output.logits))
853
855
  MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
854
856
  return {
855
857
  "loss": loss_list,
856
- "logits": eval_output.logits,
858
+ "topk_logprobs": topk_logprobs.values,
857
859
  "model": model,
858
860
  }
859
861
 
860
862
 
861
863
  @pytest.mark.parametrize(
862
- "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
864
+ "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
863
865
  [
864
866
  ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 1e-4, 1e-5, 5e-3, 1e-5),
865
867
  pytest.param(
@@ -1013,7 +1015,7 @@ def run_mini_model(
1013
1015
  # TODO: mixtral is flaky so disable the test for now
1014
1016
  # ("mini_mixtral", 32, 1e-4, torch.float32, 5e-4, 1e-4, 5e-3, 1e-5, 1e-2, 1e-5),
1015
1017
  # Gemma 1.1 and 2 has more tolerance because currently, the kernel is not a perfect match (casts are not done the same way)
1016
- ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
1018
+ ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-2, 5e-3, 1e-5),
1017
1019
  ("mini_gemma1.1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
1018
1020
  ("mini_gemma2", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
1019
1021
  pytest.param(
@@ -1041,8 +1043,8 @@ def test_mini_model(
1041
1043
  dtype,
1042
1044
  loss_atol,
1043
1045
  loss_rtol,
1044
- logits_atol,
1045
- logits_rtol,
1046
+ logprobs_atol,
1047
+ logprobs_rtol,
1046
1048
  param_atol,
1047
1049
  param_rtol,
1048
1050
  ):
@@ -1060,13 +1062,13 @@ def test_mini_model(
1060
1062
  rtol=loss_rtol,
1061
1063
  )
1062
1064
 
1063
- # Compare the logits from evaluation step
1064
- if expected_output["logits"] is not None and actual_output["logits"] is not None:
1065
+ # Compare the topk logprobs from evaluation step
1066
+ if expected_output["topk_logprobs"] is not None and actual_output["topk_logprobs"] is not None:
1065
1067
  assert_verbose_allclose(
1066
- expected_output["logits"],
1067
- actual_output["logits"],
1068
- atol=logits_atol,
1069
- rtol=logits_rtol,
1068
+ expected_output["topk_logprobs"],
1069
+ actual_output["topk_logprobs"],
1070
+ atol=logprobs_atol,
1071
+ rtol=logprobs_rtol,
1070
1072
  )
1071
1073
 
1072
1074
  # Compare the params from the last step
@@ -20,6 +20,8 @@ from test.utils import FAKE_CONFIGS_PATH
20
20
  from test.utils import UNTOKENIZED_DATASET_PATH
21
21
  from test.utils import MiniModelConfig
22
22
  from test.utils import assert_verbose_allclose
23
+ from test.utils import get_logprobs
24
+ from test.utils import get_topk
23
25
  from test.utils import is_torchvision_available
24
26
  from test.utils import load_image_processing_config
25
27
  from test.utils import load_processor_config
@@ -762,11 +764,16 @@ def run_mini_model_multimodal(
762
764
  print(f"Step {i}, Loss: {output.loss.item()}")
763
765
  loss_list.append(output.loss.item())
764
766
 
765
- return {"loss": loss_list, "logits": output.logits, "model": model}
767
+ topk_logprobs = get_topk(get_logprobs(output.logits))
768
+ return {
769
+ "loss": loss_list,
770
+ "topk_logprobs": topk_logprobs.values,
771
+ "model": model,
772
+ }
766
773
 
767
774
 
768
775
  @pytest.mark.parametrize(
769
- "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
776
+ "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
770
777
  [
771
778
  pytest.param(
772
779
  "mini_qwen2_vl",
@@ -875,10 +882,10 @@ def run_mini_model_multimodal(
875
882
  32,
876
883
  1e-4,
877
884
  torch.float32,
878
- 1e-8,
879
- 1e-5,
880
- 5e-3,
881
- 1e-5,
885
+ 1e-3,
886
+ 1e-3,
887
+ 1e-1,
888
+ 1e-1,
882
889
  5e-3,
883
890
  1e-5,
884
891
  marks=[
@@ -898,8 +905,8 @@ def test_mini_model_multimodal(
898
905
  dtype,
899
906
  loss_atol,
900
907
  loss_rtol,
901
- logits_atol,
902
- logits_rtol,
908
+ logprobs_atol,
909
+ logprobs_rtol,
903
910
  param_atol,
904
911
  param_rtol,
905
912
  ):
@@ -920,10 +927,10 @@ def test_mini_model_multimodal(
920
927
 
921
928
  # Compare the logits from the last step
922
929
  assert_verbose_allclose(
923
- expected_output["logits"],
924
- actual_output["logits"],
925
- atol=logits_atol,
926
- rtol=logits_rtol,
930
+ expected_output["topk_logprobs"],
931
+ actual_output["topk_logprobs"],
932
+ atol=logprobs_atol,
933
+ rtol=logprobs_rtol,
927
934
  )
928
935
 
929
936
  # Compare the params from the last step
@@ -38,6 +38,8 @@ from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe
38
38
  from test.utils import DEFAULT_DATASET_PATH
39
39
  from test.utils import MiniModelConfig
40
40
  from test.utils import assert_verbose_allclose
41
+ from test.utils import get_logprobs
42
+ from test.utils import get_topk
41
43
  from test.utils import revert_liger_kernel_to_gemma
42
44
  from test.utils import revert_liger_kernel_to_gemma2
43
45
  from test.utils import revert_liger_kernel_to_gemma3_text
@@ -841,12 +843,17 @@ def run_mini_model(
841
843
  print(f"Step {i}, Loss: {output.loss.item()}")
842
844
  loss_list.append(output.loss.item())
843
845
 
846
+ topk_logprobs = get_topk(get_logprobs(output.logits))
844
847
  MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
845
- return {"loss": loss_list, "logits": output.logits, "model": model}
848
+ return {
849
+ "loss": loss_list,
850
+ "topk_logprobs": topk_logprobs.values,
851
+ "model": model,
852
+ }
846
853
 
847
854
 
848
855
  @pytest.mark.parametrize(
849
- "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol",
856
+ "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
850
857
  [
851
858
  ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 1e-4, 1e-5, 5e-3, 1e-5),
852
859
  pytest.param(
@@ -1027,8 +1034,8 @@ def test_mini_model(
1027
1034
  dtype,
1028
1035
  loss_atol,
1029
1036
  loss_rtol,
1030
- logits_atol,
1031
- logits_rtol,
1037
+ logprobs_atol,
1038
+ logprobs_rtol,
1032
1039
  param_atol,
1033
1040
  param_rtol,
1034
1041
  ):
@@ -1048,12 +1055,11 @@ def test_mini_model(
1048
1055
 
1049
1056
  # No logits are materialized
1050
1057
  # import pdb; pdb.set_trace()
1051
- # Compare the logits from the last step
1052
1058
  assert_verbose_allclose(
1053
- expected_output["logits"],
1054
- actual_output["logits"],
1055
- atol=logits_atol,
1056
- rtol=logits_rtol,
1059
+ expected_output["topk_logprobs"],
1060
+ actual_output["topk_logprobs"],
1061
+ atol=logprobs_atol,
1062
+ rtol=logprobs_rtol,
1057
1063
  )
1058
1064
 
1059
1065
  # Compare the params from the last step
@@ -57,6 +57,17 @@ def set_seed(seed=42):
57
57
  os.environ["PYTHONHASHSEED"] = str(seed)
58
58
 
59
59
 
60
+ @torch.no_grad
61
+ def get_logprobs(tensor):
62
+ return torch.nn.functional.log_softmax(tensor, dim=-1, dtype=torch.float32)
63
+
64
+
65
+ @torch.no_grad
66
+ def get_topk(tensor, k=20):
67
+ topk = torch.topk(tensor, k, dim=-1)
68
+ return topk
69
+
70
+
60
71
  def assert_verbose_allclose(tensor1, tensor2, rtol=1e-05, atol=1e-08, max_print=5):
61
72
  """
62
73
  Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.