liger-kernel-nightly 0.6.1.dev20250805235815__tar.gz → 0.6.1.dev20250809233744__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/data/all_benchmark_data.csv +65 -1
- liger_kernel_nightly-0.6.1.dev20250809233744/benchmark/scripts/benchmark_llama4_rope.py +249 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/pyproject.toml +1 -1
- liger_kernel_nightly-0.6.1.dev20250809233744/src/liger_kernel/ops/llama4_rope.py +225 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/__init__.py +4 -0
- liger_kernel_nightly-0.6.1.dev20250809233744/src/liger_kernel/transformers/llama4_rope.py +93 -0
- liger_kernel_nightly-0.6.1.dev20250809233744/src/liger_kernel/transformers/model/phi3.py +112 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/monkey_patch.py +10 -20
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel_nightly.egg-info/SOURCES.txt +3 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/bf16/test_mini_models.py +1 -1
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/bf16/test_mini_models_multimodal.py +1 -2
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/fp32/test_mini_models.py +1 -1
- liger_kernel_nightly-0.6.1.dev20250805235815/src/liger_kernel/transformers/model/phi3.py +0 -249
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/workflows/benchmark.yml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/.gitignore +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/Makefile +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/README.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_softmax.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/dev/modal/benchmarks.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/index.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/docs/license.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/setup.cfg +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/setup.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/gemma3.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/glm4.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/llama4.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/llava.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/olmo2.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/paligemma.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/qwen3.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/model/smollm3.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/softmax.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/test_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_softmax.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/triton/test_triton_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.1.dev20250805235815 → liger_kernel_nightly-0.6.1.dev20250809233744}/test/utils.py +0 -0
@@ -1574,4 +1574,68 @@ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,2048,208.06298828
|
|
1574
1574
|
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578125,416.11767578125,416.11767578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1575
1575
|
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1576
1576
|
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1577
|
-
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1577
|
+
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1578
|
+
llama4_rope,liger,forward,speed,ms,H,hidden size,512,0.08249600231647491,0.08102399855852127,0.08432000130414963,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
|
1579
|
+
llama4_rope,liger,forward,speed,ms,H,hidden size,2048,0.08169600367546082,0.08037760108709335,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
|
1580
|
+
llama4_rope,liger,forward,speed,ms,H,hidden size,8192,0.08128000050783157,0.07980799674987793,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
|
1581
|
+
llama4_rope,huggingface,forward,speed,ms,H,hidden size,512,0.03759999945759773,0.03612799942493439,0.03907199949026108,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
|
1582
|
+
llama4_rope,huggingface,forward,speed,ms,H,hidden size,2048,0.06185600161552429,0.061267200857400894,0.06252799928188324,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
|
1583
|
+
llama4_rope,huggingface,forward,speed,ms,H,hidden size,8192,0.206496000289917,0.20582400262355804,0.20716799795627594,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1
|
1584
|
+
llama4_rope,liger,backward,speed,ms,H,hidden size,512,0.15404799580574036,0.15241600573062897,0.15615999698638916,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
|
1585
|
+
llama4_rope,liger,backward,speed,ms,H,hidden size,2048,0.1536320000886917,0.15190400183200836,0.1558080017566681,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
|
1586
|
+
llama4_rope,liger,backward,speed,ms,H,hidden size,8192,0.15263999998569489,0.15094399452209473,0.15491199493408203,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1
|
1587
|
+
llama4_rope,huggingface,backward,speed,ms,H,hidden size,512,0.13760000467300415,0.13574400544166565,0.14009599387645721,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
|
1588
|
+
llama4_rope,huggingface,backward,speed,ms,H,hidden size,2048,0.13600000739097595,0.13449600338935852,0.1382720023393631,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
|
1589
|
+
llama4_rope,huggingface,backward,speed,ms,H,hidden size,8192,0.21011200547218323,0.20924800634384155,0.21110400557518005,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1
|
1590
|
+
llama4_rope,liger,full,speed,ms,H,hidden size,512,0.3652159869670868,0.3619840145111084,0.3699840009212494,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
|
1591
|
+
llama4_rope,liger,full,speed,ms,H,hidden size,2048,0.3599040061235428,0.2881920039653778,0.36559998989105225,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
|
1592
|
+
llama4_rope,liger,full,speed,ms,H,hidden size,8192,0.2874239981174469,0.2852480113506317,0.29029120206832887,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1
|
1593
|
+
llama4_rope,huggingface,full,speed,ms,H,hidden size,512,0.24691200256347656,0.24489599466323853,0.24961919784545897,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1594
|
+
llama4_rope,huggingface,full,speed,ms,H,hidden size,2048,0.24774399399757385,0.24582399427890778,0.2505407989025116,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1595
|
+
llama4_rope,huggingface,full,speed,ms,H,hidden size,8192,0.41414400935173035,0.41337600350379944,0.41491198539733887,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1596
|
+
llama4_rope,liger,full,memory,MB,H,hidden size,512,37.23486328125,37.23486328125,37.23486328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1597
|
+
llama4_rope,liger,full,memory,MB,H,hidden size,2048,52.89111328125,52.89111328125,52.89111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1598
|
+
llama4_rope,liger,full,memory,MB,H,hidden size,8192,115.51611328125,115.51611328125,115.51611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1599
|
+
llama4_rope,huggingface,full,memory,MB,H,hidden size,512,49.64111328125,49.64111328125,49.64111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1600
|
+
llama4_rope,huggingface,full,memory,MB,H,hidden size,2048,102.51611328125,102.51611328125,102.51611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1601
|
+
llama4_rope,huggingface,full,memory,MB,H,hidden size,8192,314.01611328125,314.01611328125,314.01611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1
|
1602
|
+
llama4_rope,liger,forward,speed,ms,T,sequence length,1024,0.07417599856853485,0.07248000055551529,0.07596799731254578,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
|
1603
|
+
llama4_rope,liger,forward,speed,ms,T,sequence length,2048,0.08182399719953537,0.08006399869918823,0.08380799740552902,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
|
1604
|
+
llama4_rope,liger,forward,speed,ms,T,sequence length,4096,0.11708799749612808,0.1167680025100708,0.11744000017642975,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
|
1605
|
+
llama4_rope,liger,forward,speed,ms,T,sequence length,8192,0.2165440022945404,0.21596799790859222,0.21715199947357178,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
|
1606
|
+
llama4_rope,liger,forward,speed,ms,T,sequence length,16384,0.41756799817085266,0.41705599427223206,0.41811200976371765,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1
|
1607
|
+
llama4_rope,huggingface,forward,speed,ms,T,sequence length,1024,0.11644800007343292,0.11590400338172913,0.11708799749612808,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
|
1608
|
+
llama4_rope,huggingface,forward,speed,ms,T,sequence length,2048,0.20659199357032776,0.20608000457286835,0.2072640061378479,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
|
1609
|
+
llama4_rope,huggingface,forward,speed,ms,T,sequence length,4096,0.38553598523139954,0.3846847891807556,0.38624000549316406,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
|
1610
|
+
llama4_rope,huggingface,forward,speed,ms,T,sequence length,8192,0.7411519885063171,0.7403839826583862,0.7420480251312256,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
|
1611
|
+
llama4_rope,huggingface,forward,speed,ms,T,sequence length,16384,1.4553920030593872,1.4543871641159059,1.4562879800796509,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1
|
1612
|
+
llama4_rope,liger,backward,speed,ms,T,sequence length,1024,0.11840000003576279,0.11711999773979187,0.12031999975442886,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
|
1613
|
+
llama4_rope,liger,backward,speed,ms,T,sequence length,2048,0.12336000055074692,0.12198399752378464,0.12489599734544754,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
|
1614
|
+
llama4_rope,liger,backward,speed,ms,T,sequence length,4096,0.12380799651145935,0.12240000069141388,0.12559999525547028,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
|
1615
|
+
llama4_rope,liger,backward,speed,ms,T,sequence length,8192,0.2170879989862442,0.2165759950876236,0.21753600239753723,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
|
1616
|
+
llama4_rope,liger,backward,speed,ms,T,sequence length,16384,0.4175359904766083,0.41705599427223206,0.4181375920772552,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1
|
1617
|
+
llama4_rope,huggingface,backward,speed,ms,T,sequence length,1024,0.1189119964838028,0.11769600212574005,0.12003199756145477,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
|
1618
|
+
llama4_rope,huggingface,backward,speed,ms,T,sequence length,2048,0.21011200547218323,0.20927999913692474,0.21119999885559082,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
|
1619
|
+
llama4_rope,huggingface,backward,speed,ms,T,sequence length,4096,0.39740800857543945,0.3963199853897095,0.39824000000953674,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
|
1620
|
+
llama4_rope,huggingface,backward,speed,ms,T,sequence length,8192,0.7540159821510315,0.7528960108757019,0.7550719976425171,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
|
1621
|
+
llama4_rope,huggingface,backward,speed,ms,T,sequence length,16384,1.4822720289230347,1.4810559749603271,1.4833600521087646,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1
|
1622
|
+
llama4_rope,liger,full,speed,ms,T,sequence length,1024,0.2874400019645691,0.2853440046310425,0.29052799940109253,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
|
1623
|
+
llama4_rope,liger,full,speed,ms,T,sequence length,2048,0.28646400570869446,0.2845759987831116,0.28963199257850647,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
|
1624
|
+
llama4_rope,liger,full,speed,ms,T,sequence length,4096,0.29897600412368774,0.29660800099372864,0.302131199836731,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
|
1625
|
+
llama4_rope,liger,full,speed,ms,T,sequence length,8192,0.4315840005874634,0.4304639995098114,0.43270400166511536,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
|
1626
|
+
llama4_rope,liger,full,speed,ms,T,sequence length,16384,0.833184003829956,0.8322240114212036,0.8345024228096007,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1
|
1627
|
+
llama4_rope,huggingface,full,speed,ms,T,sequence length,1024,0.24592000246047974,0.24396799504756927,0.24876800179481506,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1628
|
+
llama4_rope,huggingface,full,speed,ms,T,sequence length,2048,0.4138239920139313,0.41308799386024475,0.4145599901676178,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1629
|
+
llama4_rope,huggingface,full,speed,ms,T,sequence length,4096,0.7800959944725037,0.7790719866752625,0.7810239791870117,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1630
|
+
llama4_rope,huggingface,full,speed,ms,T,sequence length,8192,1.4911680221557617,1.4902976036071778,1.4922879934310913,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1631
|
+
llama4_rope,huggingface,full,speed,ms,T,sequence length,16384,2.9344160556793213,2.9333438873291016,2.9353599548339844,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1632
|
+
llama4_rope,liger,full,memory,MB,T,sequence length,1024,73.75830078125,73.75830078125,73.75830078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1633
|
+
llama4_rope,liger,full,memory,MB,T,sequence length,2048,115.51611328125,115.51611328125,115.51611328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1634
|
+
llama4_rope,liger,full,memory,MB,T,sequence length,4096,199.03173828125,199.03173828125,199.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1635
|
+
llama4_rope,liger,full,memory,MB,T,sequence length,8192,366.06298828125,366.06298828125,366.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1636
|
+
llama4_rope,liger,full,memory,MB,T,sequence length,16384,700.12548828125,700.12548828125,700.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1637
|
+
llama4_rope,huggingface,full,memory,MB,T,sequence length,1024,173.00830078125,173.00830078125,173.00830078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1638
|
+
llama4_rope,huggingface,full,memory,MB,T,sequence length,2048,314.01611328125,314.01611328125,314.01611328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1639
|
+
llama4_rope,huggingface,full,memory,MB,T,sequence length,4096,596.03173828125,596.03173828125,596.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1640
|
+
llama4_rope,huggingface,full,memory,MB,T,sequence length,8192,1160.06298828125,1160.06298828125,1160.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
1641
|
+
llama4_rope,huggingface,full,memory,MB,T,sequence length,16384,2288.12548828125,2288.12548828125,2288.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
@@ -0,0 +1,249 @@
|
|
1
|
+
import torch
|
2
|
+
import triton
|
3
|
+
|
4
|
+
from transformers.models.llama4.configuration_llama4 import Llama4TextConfig
|
5
|
+
from transformers.models.llama4.modeling_llama4 import Llama4TextRotaryEmbedding
|
6
|
+
from transformers.models.llama4.modeling_llama4 import apply_rotary_emb
|
7
|
+
from utils import QUANTILES
|
8
|
+
from utils import SingleBenchmarkRunInput
|
9
|
+
from utils import SingleBenchmarkRunOutput
|
10
|
+
from utils import _test_memory
|
11
|
+
from utils import parse_benchmark_script_args
|
12
|
+
from utils import run_benchmarks
|
13
|
+
|
14
|
+
from liger_kernel.transformers.llama4_rope import liger_llama4_text_rotary_pos_emb
|
15
|
+
from liger_kernel.utils import infer_device
|
16
|
+
from liger_kernel.utils import transformers_version_dispatch
|
17
|
+
|
18
|
+
# Resolved once at import time via liger_kernel.utils.infer_device(); the
# benchmark functions in this script pass it as ``device=`` when creating
# tensors and the rotary embedding module.
device = infer_device()
|
19
|
+
|
20
|
+
|
21
|
+
def bench_speed_llama4_rope(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
    """Time the Llama4 rotary position embedding for one benchmark configuration.

    Args:
        input: Benchmark description. The fields read here are:
            ``kernel_provider`` ("liger" or "huggingface"),
            ``kernel_operation_mode`` ("forward", "backward" or "full"),
            ``extra_benchmark_config`` (must hold "num_q_heads",
            "num_kv_heads" and "dtype", plus either "hidden_size" or
            "seq_len"), and ``x``, the swept value that fills in whichever of
            hidden_size / seq_len is missing from the config.

    Returns:
        A ``SingleBenchmarkRunOutput`` whose ``y_20`` / ``y_50`` / ``y_80``
        hold the timings returned by ``triton.testing.do_bench`` for
        ``QUANTILES``.

    Raises:
        ValueError: If the provider or the operation mode is not one of the
            supported values.
    """
    provider = input.kernel_provider
    mode = input.kernel_operation_mode

    extra_benchmark_config = input.extra_benchmark_config
    num_q_heads = extra_benchmark_config["num_q_heads"]
    num_kv_heads = extra_benchmark_config["num_kv_heads"]
    dtype = extra_benchmark_config["dtype"]

    # x can be either hidden_size or seq_len: the one that is not fixed in the
    # extra config is taken from the swept value.
    hidden_size = extra_benchmark_config.get("hidden_size", input.x)
    seq_len = extra_benchmark_config.get("seq_len", input.x)

    head_dim = hidden_size // num_q_heads

    # Create Llama4TextConfig for the rotary embedding
    config = Llama4TextConfig(
        hidden_size=hidden_size,
        num_attention_heads=num_q_heads,
        num_key_value_heads=num_kv_heads,
        head_dim=head_dim,
        max_position_embeddings=seq_len,
        rope_theta=10000.0,
        rope_scaling=None,  # Use default rope type
    )

    # Both dispatch branches build the same class with the same kwargs; the
    # call is kept in this form to mirror the memory benchmark in this script.
    rotary_emb = transformers_version_dispatch(
        "4.48.0",
        Llama4TextRotaryEmbedding,
        Llama4TextRotaryEmbedding,
        before_kwargs={"config": config, "device": device},
        after_kwargs={"config": config, "device": device},
    )

    q = torch.randn(
        (1, seq_len, num_q_heads, head_dim),
        device=device,
        requires_grad=True,
        dtype=dtype,
    )
    k = torch.randn(
        (1, seq_len, num_kv_heads, head_dim),
        device=device,
        requires_grad=True,
        dtype=dtype,
    )
    # Upstream gradients for the backward pass; randn_like already inherits
    # the device and dtype of q / k.
    dq = torch.randn_like(q)
    dk = torch.randn_like(k)
    pos_ids = torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0)
    freqs_cis = rotary_emb(q, pos_ids)

    def fwd():
        if provider == "liger":
            return liger_llama4_text_rotary_pos_emb(q, k, freqs_cis)
        elif provider == "huggingface":
            return apply_rotary_emb(q, k, freqs_cis)
        else:
            raise ValueError(f"Invalid provider: {provider} for Llama4 RoPE embedding")

    if mode == "forward":
        bench_fn = fwd
    elif mode == "backward":
        # Run the forward pass once outside the timed region; retain_graph
        # lets the same graph be differentiated on every timed repetition.
        q_out, k_out = fwd()

        def bench_fn():
            return torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True, retain_graph=True)

    elif mode == "full":

        def bench_fn():
            q_out, k_out = fwd()
            torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True)

    else:
        # Without this branch the timing variables would be unbound and the
        # return below would fail with an opaque UnboundLocalError.
        raise ValueError(f"Invalid mode: {mode} for Llama4 RoPE embedding")

    ms_50, ms_20, ms_80 = triton.testing.do_bench(
        bench_fn,
        grad_to_none=[q, k],
        rep=400,
        quantiles=QUANTILES,
    )
    return SingleBenchmarkRunOutput(
        y_20=ms_20,
        y_50=ms_50,
        y_80=ms_80,
    )
|
114
|
+
|
115
|
+
|
116
|
+
def bench_memory_llama4_rope(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
    """Measure memory for one forward + backward pass of Llama4 text RoPE.

    The provider in ``input.kernel_provider`` selects the implementation:
    ``"liger"`` runs ``liger_llama4_text_rotary_pos_emb``; any other value runs
    the reference ``apply_rotary_emb``. ``input.x`` supplies whichever of
    ``hidden_size`` / ``seq_len`` is not pinned in ``extra_benchmark_config``.

    Relies on the module-level ``device`` and ``QUANTILES`` defined outside
    this function.

    Returns:
        SingleBenchmarkRunOutput carrying the three values reported by
        ``_test_memory`` for ``QUANTILES`` (stored as y_20 / y_50 / y_80).
    """
    provider = input.kernel_provider
    cfg = input.extra_benchmark_config

    num_q_heads = cfg["num_q_heads"]
    num_kv_heads = cfg["num_kv_heads"]
    dtype = cfg["dtype"]

    # The swept value stands in for whichever dimension the config leaves open.
    if "hidden_size" in cfg:
        hidden_size = cfg["hidden_size"]
    else:
        hidden_size = input.x
    if "seq_len" in cfg:
        seq_len = cfg["seq_len"]
    else:
        seq_len = input.x

    head_dim = hidden_size // num_q_heads

    # Text config that drives the rotary-embedding module.
    config = Llama4TextConfig(
        hidden_size=hidden_size,
        num_attention_heads=num_q_heads,
        num_key_value_heads=num_kv_heads,
        head_dim=head_dim,
        max_position_embeddings=seq_len,
        rope_theta=10000.0,
        rope_scaling=None,  # Use default rope type
    )

    # Same class and kwargs on both sides of the version boundary.
    rotary_emb = transformers_version_dispatch(
        "4.48.0",
        Llama4TextRotaryEmbedding,
        Llama4TextRotaryEmbedding,
        before_kwargs=dict(config=config, device=device),
        after_kwargs=dict(config=config, device=device),
    )

    # Inputs are laid out as (batch=1, seq, heads, head_dim).
    q_shape = (1, seq_len, num_q_heads, head_dim)
    k_shape = (1, seq_len, num_kv_heads, head_dim)
    q = torch.randn(q_shape, device=device, requires_grad=True, dtype=dtype)
    k = torch.randn(k_shape, device=device, requires_grad=True, dtype=dtype)

    # Upstream gradients fed to autograd.grad.
    dq = torch.randn_like(q, device=device, dtype=dtype)
    dk = torch.randn_like(k, device=device)

    pos_ids = torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0)
    freqs_cis = rotary_emb(q, pos_ids)

    def full():
        rope_fn = liger_llama4_text_rotary_pos_emb if provider == "liger" else apply_rotary_emb
        q_out, k_out = rope_fn(q, k, freqs_cis)
        torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True, retain_graph=True)

    mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES)
    return SingleBenchmarkRunOutput(y_20=mem_20, y_50=mem_50, y_80=mem_80)
|
184
|
+
|
185
|
+
|
186
|
+
if __name__ == "__main__":
    args = parse_benchmark_script_args()

    # Two sweeps: one over hidden size (sequence length pinned) and one over
    # sequence length (hidden size pinned). Each sweep is benchmarked for
    # speed first, then for memory.
    sweeps = (
        {
            "kernel_name": "llama4_rope",
            "x_name": "H",
            "x_label": "hidden size",
            "x_values": [32 * (2**i) for i in range(4, 10, 2)],
            "kernel_providers": ["liger", "huggingface"],
            "extra_benchmark_configs": [
                {
                    "dtype": torch.bfloat16,
                    "seq_len": 2048,
                    "num_q_heads": 32,
                    "num_kv_heads": 8,
                }
            ],
            "overwrite": args.overwrite,
        },
        {
            "kernel_name": "llama4_rope",
            "x_name": "T",
            "x_label": "sequence length",
            "x_values": [2**i for i in range(10, 15)],
            "kernel_providers": ["liger", "huggingface"],
            "extra_benchmark_configs": [
                {
                    "dtype": torch.bfloat16,
                    "hidden_size": 8192,
                    "num_q_heads": 32,
                    "num_kv_heads": 8,
                }
            ],
            "overwrite": args.overwrite,
        },
    )

    for sweep in sweeps:
        run_benchmarks(
            bench_test_fn=bench_speed_llama4_rope,
            kernel_operation_modes=["forward", "backward", "full"],
            metric_name="speed",
            metric_unit="ms",
            **sweep,
        )
        run_benchmarks(
            bench_test_fn=bench_memory_llama4_rope,
            kernel_operation_modes=["full"],
            metric_name="memory",
            metric_unit="MB",
            **sweep,
        )
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "liger_kernel_nightly"
|
7
|
-
version = "0.6.1.dev20250805235815"
|
7
|
+
version = "0.6.1.dev20250809233744"
|
8
8
|
description = "Efficient Triton kernels for LLM Training"
|
9
9
|
urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
@@ -0,0 +1,225 @@
|
|
1
|
+
import torch
|
2
|
+
import triton
|
3
|
+
import triton.language as tl
|
4
|
+
|
5
|
+
|
6
|
+
def _prepare_freqs(freqs_cis: torch.Tensor, seq_len: int, head_dim_half: int):
    """Split rotary frequencies into real/imag parts shaped (seq_len, head_dim_half).

    Args:
        freqs_cis: Either a complex tensor whose last dim is ``head_dim_half``,
            or a real tensor whose last dim is ``2 * head_dim_half`` with the
            real half stored first and the imaginary half second.
        seq_len: Number of rows the result must have.
        head_dim_half: Number of complex frequencies per position.

    Returns:
        ``(freqs_real, freqs_imag)``. All leading dims are flattened into one
        row dim; a single row is broadcast to ``seq_len`` rows, and surplus
        rows (e.g. from a leading batch dim) are dropped by keeping only the
        first ``seq_len`` rows.

    Raises:
        ValueError: if the last dim does not match, or if there are fewer than
            ``seq_len`` rows and more (or fewer) than exactly one.
    """
    # Obtain the two components, whichever encoding the caller used.
    if freqs_cis.is_complex():
        real_part, imag_part = freqs_cis.real, freqs_cis.imag
    elif freqs_cis.shape[-1] == 2 * head_dim_half:
        real_part = freqs_cis[..., :head_dim_half]
        imag_part = freqs_cis[..., head_dim_half:]
    else:
        raise ValueError(
            f"Unexpected freqs_cis shape for non-complex input: {freqs_cis.shape}, expected last dim = {2 * head_dim_half}"
        )

    if real_part.shape[-1] != head_dim_half:
        raise ValueError(f"Unexpected last dim for freqs: {real_part.shape[-1]} (expected {head_dim_half})")

    # Collapse every leading dim into a single row dim.
    real_part = real_part.reshape(-1, head_dim_half)
    imag_part = imag_part.reshape(-1, head_dim_half)

    n_rows = real_part.shape[0]
    if n_rows > seq_len:
        # Surplus rows: keep the leading seq_len rows only.
        return real_part[:seq_len], imag_part[:seq_len]
    if n_rows == seq_len:
        return real_part, imag_part
    if n_rows != 1:
        raise ValueError(f"Insufficient rows in freqs: {n_rows} < seq_len={seq_len}")
    # Exactly one row: broadcast it across all positions without copying.
    return real_part.expand(seq_len, -1), imag_part.expand(seq_len, -1)
|
41
|
+
|
42
|
+
|
43
|
+
def _maybe_to_dtype(t: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    """Return ``t`` unchanged if it already has ``dtype``, else ``t.to(dtype)``."""
    if t.dtype == dtype:
        return t
    return t.to(dtype)
|
45
|
+
|
46
|
+
|
47
|
+
def _maybe_contiguous(t: torch.Tensor) -> torch.Tensor:
    """Return ``t`` unchanged if it is contiguous, else a contiguous copy."""
    if t.is_contiguous():
        return t
    return t.contiguous()
|
49
|
+
|
50
|
+
|
51
|
+
def _cast_and_contiguous(q, k, freqs_real, freqs_imag):
    """Bring all kernel operands to q's dtype and a contiguous layout.

    The compute dtype is always ``q.dtype``: no upcast is performed for
    reduced-precision inputs. Tensors that already satisfy both requirements
    are returned as the very same objects (no copy is made).

    Args:
        q: Query tensor; defines the compute dtype.
        k: Key tensor; cast to ``q.dtype`` if it differs.
        freqs_real: Real part of the rotary frequencies.
        freqs_imag: Imaginary part of the rotary frequencies.

    Returns:
        ``(q, k, freqs_real, freqs_imag)``, each contiguous and of ``q.dtype``.
    """
    # The previous ternary (`float32 if q is float32 else q.dtype`) evaluated
    # to q.dtype on both branches, so q itself never needs a cast.
    compute_dtype = q.dtype

    q = _maybe_contiguous(q)
    k = _maybe_contiguous(_maybe_to_dtype(k, compute_dtype))
    freqs_real = _maybe_contiguous(_maybe_to_dtype(freqs_real, compute_dtype))
    freqs_imag = _maybe_contiguous(_maybe_to_dtype(freqs_imag, compute_dtype))
    return q, k, freqs_real, freqs_imag
|
64
|
+
|
65
|
+
|
66
|
+
@triton.jit
def _llama4_rope_kernel(
    q_ptr,
    k_ptr,
    freqs_real_ptr,
    freqs_imag_ptr,
    q_row_stride,
    k_row_stride,
    q_head_stride,
    k_head_stride,
    freqs_row_stride,
    seq_len,
    batch_size,
    imag_sign,
    head_dim_half: tl.constexpr,
    n_q_heads: tl.constexpr,
    n_k_heads: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    """
    Rotate q and k by per-position complex frequencies, writing results in place.

    Grid: (batch*seq, head). Program axis 0 selects one (batch, seq) row and
    axis 1 selects one head index; the same program handles the query head and
    the key head with that index, whenever the index is below n_q_heads /
    n_k_heads respectively.

    Layout expected by the address arithmetic below:
      - Row r = batch_idx * seq_len + seq_idx of q/k starts at r * *_row_stride,
        i.e. the batch and seq dims are addressed as one flattened dim.
      - Within a head, element 2*d is the real part and 2*d + 1 the imaginary
        part of complex pair d (interleaved, unit stride).
      - freqs_real / freqs_imag are indexed as seq_idx * freqs_row_stride + d
        (unit stride in the last dim); freqs_row_stride is shared by both.

    imag_sign scales the imaginary part of the frequencies before the multiply;
    passing -1.0 therefore multiplies by the complex conjugate.

    Outputs are stored back through q_ptr / k_ptr, overwriting the inputs.
    """
    # 2D grid
    pid_bs = tl.program_id(0)  # over batch*seq
    pid_h = tl.program_id(1)  # over heads

    batch_idx = pid_bs // seq_len
    seq_idx = pid_bs % seq_len

    # Bounds check: exits for programs past the last batch. The seq_idx test
    # cannot fire on its own, since seq_idx is a remainder modulo seq_len.
    if batch_idx >= batch_size or seq_idx >= seq_len:
        return

    # Base pointers for this (batch, seq) position
    base_offset = batch_idx * seq_len + seq_idx
    q_base = q_ptr + base_offset * q_row_stride
    k_base = k_ptr + base_offset * k_row_stride

    # Tiling over dim/2: each iteration covers BLOCK_SIZE complex pairs
    for d_start in tl.static_range(0, head_dim_half, BLOCK_SIZE):
        d_indices = d_start + tl.arange(0, BLOCK_SIZE)
        # Masks off lanes past head_dim_half in the final (partial) tile
        mask_d = d_indices < head_dim_half

        # Load frequencies once per tile (freqs layout: [seq_len, head_dim_half])
        freq_idx = d_indices
        freqs_real = tl.load(freqs_real_ptr + seq_idx * freqs_row_stride + freq_idx, mask=mask_d, other=0.0)
        freqs_imag = tl.load(freqs_imag_ptr + seq_idx * freqs_row_stride + freq_idx, mask=mask_d, other=0.0)
        # Sign flip selects forward rotation (+1) or its conjugate (-1)
        freqs_imag = freqs_imag * imag_sign

        # Process one query head per program in pid_h
        if pid_h < n_q_heads:
            q_head_ptr = q_base + pid_h * q_head_stride
            # Interleaved layout: even offsets are real parts, odd offsets imaginary
            q_real = tl.load(q_head_ptr + d_indices * 2, mask=mask_d, other=0.0)
            q_imag = tl.load(q_head_ptr + d_indices * 2 + 1, mask=mask_d, other=0.0)

            # Complex multiply with FMAs: (a+ib)*(c+i d) = (a*c - b*d) + i(a*d + b*c)
            new_q_real = tl.math.fma(q_real, freqs_real, -(q_imag * freqs_imag))
            new_q_imag = tl.math.fma(q_real, freqs_imag, q_imag * freqs_real)

            # Written back to the same addresses that were just read (in place)
            tl.store(q_head_ptr + d_indices * 2, new_q_real, mask=mask_d)
            tl.store(q_head_ptr + d_indices * 2 + 1, new_q_imag, mask=mask_d)

        # Process one key head per program in pid_h
        if pid_h < n_k_heads:
            k_head_ptr = k_base + pid_h * k_head_stride
            k_real = tl.load(k_head_ptr + d_indices * 2, mask=mask_d, other=0.0)
            k_imag = tl.load(k_head_ptr + d_indices * 2 + 1, mask=mask_d, other=0.0)

            # Same complex multiply as for the query head
            new_k_real = tl.math.fma(k_real, freqs_real, -(k_imag * freqs_imag))
            new_k_imag = tl.math.fma(k_real, freqs_imag, k_imag * freqs_real)

            tl.store(k_head_ptr + d_indices * 2, new_k_real, mask=mask_d)
            tl.store(k_head_ptr + d_indices * 2 + 1, new_k_imag, mask=mask_d)
|
140
|
+
|
141
|
+
|
142
|
+
def _select_kernel_meta(head_dim_half: int):
    """Pick ``(BLOCK_SIZE, num_warps)`` heuristically from ``head_dim_half``.

    Larger head dimensions get larger tiles and more warps; the first tier
    whose threshold is met wins, and anything below the smallest threshold
    falls back to ``(16, 2)``.
    """
    # (minimum head_dim_half, (BLOCK_SIZE, num_warps)), largest threshold first.
    tiers = (
        (256, (128, 8)),
        (96, (128, 4)),
        (48, (64, 4)),
        (24, (32, 2)),
    )
    for threshold, meta in tiers:
        if head_dim_half >= threshold:
            return meta
    return 16, 2
|
153
|
+
|
154
|
+
|
155
|
+
def llama4_rope_forward(q, k, freqs_cis, BLOCK_SIZE: int = None, imag_sign: float = 1.0):
    """Apply Llama4 rotary embedding to ``q`` and ``k`` with the Triton kernel.

    Args:
        q: 4-D tensor unpacked as (batch, seq_len, n_q_heads, head_dim).
        k: 4-D tensor whose third dim is the number of key heads.
        freqs_cis: Rotary frequencies in any form ``_prepare_freqs`` accepts.
        BLOCK_SIZE: Tile width over ``head_dim // 2``; chosen heuristically
            when ``None``. ``num_warps`` always comes from the heuristic.
        imag_sign: Multiplier for the imaginary part of the frequencies
            (``-1.0`` rotates by the conjugate).

    Returns:
        ``(q, k)`` after rotation, both in ``q``'s original dtype.

    Note:
        The kernel stores results into the buffers it is handed. When ``q`` /
        ``k`` are already contiguous and of the compute dtype, no copy is made
        beforehand, so the tensors passed in are the ones overwritten.
    """
    input_dtype = q.dtype

    batch_size, seq_len, n_q_heads, head_dim = q.shape
    _k_batch, _k_seq, n_k_heads, _k_dim = k.shape
    head_dim_half = head_dim // 2

    # Normalise the frequencies, then align dtypes/layouts of all operands.
    freqs_real, freqs_imag = _prepare_freqs(freqs_cis, seq_len, head_dim_half)
    q, k, freqs_real, freqs_imag = _cast_and_contiguous(q, k, freqs_real, freqs_imag)

    # A caller-pinned BLOCK_SIZE is honoured; only num_warps is then taken
    # from the heuristic.
    suggested_block, num_warps = _select_kernel_meta(head_dim_half)
    if BLOCK_SIZE is None:
        BLOCK_SIZE = suggested_block

    # One program per (batch*seq row, head index); the head axis spans the
    # larger of the two head counts.
    grid = (batch_size * seq_len, max(n_q_heads, n_k_heads))

    _llama4_rope_kernel[grid](
        q,
        k,
        freqs_real,
        freqs_imag,
        q.stride(1),
        k.stride(1),
        q.stride(2),
        k.stride(2),
        freqs_real.stride(0),
        seq_len,
        batch_size,
        imag_sign,
        head_dim_half,
        n_q_heads,
        n_k_heads,
        BLOCK_SIZE,
        num_warps=num_warps,
        num_stages=2,
    )

    # Restore the caller's dtype where the compute dtype differed.
    q_out = q if q.dtype == input_dtype else q.to(input_dtype)
    k_out = k if k.dtype == input_dtype else k.to(input_dtype)
    return q_out, k_out
|
209
|
+
|
210
|
+
|
211
|
+
class LigerLlama4RopeFunction(torch.autograd.Function):
    """Autograd wrapper around ``llama4_rope_forward``.

    Forward rotates ``q`` / ``k`` by ``freqs_cis``. Backward rotates the
    incoming gradients by the conjugate frequencies, which is done by re-running
    the same routine with ``imag_sign=-1.0``. No gradient is produced for
    ``freqs_cis`` or ``BLOCK_SIZE``.
    """

    @staticmethod
    def forward(ctx, q, k, freqs_cis, BLOCK_SIZE: int = None):
        """Rotate ``q`` and ``k``; remember what backward needs.

        Args:
            ctx: Autograd context.
            q: Query tensor.
            k: Key tensor.
            freqs_cis: Rotary frequencies (saved, detached, for backward).
            BLOCK_SIZE: Optional tile-width override forwarded to the kernel.

        Returns:
            ``(q_out, k_out)`` as returned by ``llama4_rope_forward``.
        """
        q_out, k_out = llama4_rope_forward(q, k, freqs_cis, BLOCK_SIZE, imag_sign=1.0)
        ctx.save_for_backward(freqs_cis.detach() if isinstance(freqs_cis, torch.Tensor) else freqs_cis)
        ctx.BLOCK_SIZE = BLOCK_SIZE
        return q_out, k_out

    @staticmethod
    def backward(ctx, dq, dk):
        """Rotate the upstream gradients by the conjugate frequencies.

        Returns:
            One entry per ``forward`` input: gradients for ``q`` and ``k``,
            then ``None`` for ``freqs_cis`` and ``None`` for ``BLOCK_SIZE``.
        """
        (freqs_cis,) = ctx.saved_tensors
        BLOCK_SIZE = getattr(ctx, "BLOCK_SIZE", None)
        # Use imag_sign=-1.0 for conjugate without materializing a new tensor
        dq_out, dk_out = llama4_rope_forward(dq, dk, freqs_cis, BLOCK_SIZE, imag_sign=-1.0)
        # forward takes four inputs, so four slots are returned. Returning only
        # three made backward fail whenever BLOCK_SIZE was passed to apply();
        # a surplus trailing None is accepted when it is omitted.
        return dq_out, dk_out, None, None
|
@@ -11,6 +11,8 @@ from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD # no
|
|
11
11
|
from liger_kernel.transformers.geglu import LigerGEGLUMLP # noqa: F401
|
12
12
|
from liger_kernel.transformers.jsd import LigerJSD # noqa: F401
|
13
13
|
from liger_kernel.transformers.layer_norm import LigerLayerNorm # noqa: F401
|
14
|
+
from liger_kernel.transformers.llama4_rope import liger_llama4_text_rotary_pos_emb # noqa: F401
|
15
|
+
from liger_kernel.transformers.llama4_rope import liger_llama4_vision_rotary_pos_emb # noqa: F401
|
14
16
|
from liger_kernel.transformers.rms_norm import LigerRMSNorm # noqa: F401
|
15
17
|
from liger_kernel.transformers.rope import liger_rotary_pos_emb # noqa: F401
|
16
18
|
from liger_kernel.transformers.swiglu import LigerBlockSparseTop2MLP # noqa: F401
|
@@ -125,6 +127,8 @@ __all__ = [
|
|
125
127
|
"LigerFusedAddRMSNorm",
|
126
128
|
"LigerRMSNorm",
|
127
129
|
"liger_rotary_pos_emb",
|
130
|
+
"liger_llama4_text_rotary_pos_emb",
|
131
|
+
"liger_llama4_vision_rotary_pos_emb",
|
128
132
|
"LigerBlockSparseTop2MLP",
|
129
133
|
"LigerPhi3SwiGLUMLP",
|
130
134
|
"LigerQwen3MoeSwiGLUMLP",
|