liger-kernel-nightly 0.5.10.dev20250528223223__tar.gz → 0.5.10.dev20250531184114__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/PKG-INFO +3 -1
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/README.md +2 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_softmax.py +3 -3
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/Low-Level-APIs.md +15 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/rms_norm.py +243 -45
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/rms_norm.py +4 -1
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/softmax.py +1 -1
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel_nightly.egg-info/PKG-INFO +3 -1
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_softmax.py +2 -2
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.gitignore +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/.idea/workspace.xml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/Makefile +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/data/all_benchmark_data.csv +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/index.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/docs/license.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/setup.cfg +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/setup.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/gema3_rms.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/gemma3.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/glm4.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/llava.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/olmo2.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/paligemma.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/qwen3.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/bf16/test_mini_models.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/fp32/test_mini_models.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/triton/test_triton_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250528223223 → liger_kernel_nightly-0.5.10.dev20250531184114}/test/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: liger_kernel_nightly
|
3
|
-
Version: 0.5.10.dev20250528223223
|
3
|
+
Version: 0.5.10.dev20250531184114
|
4
4
|
Summary: Efficient Triton kernels for LLM Training
|
5
5
|
License: BSD 2-CLAUSE LICENSE
|
6
6
|
Copyright 2024 LinkedIn Corporation
|
@@ -325,6 +325,8 @@ loss.backward()
|
|
325
325
|
| GeGLU | `liger_kernel.transformers.LigerGEGLUMLP` |
|
326
326
|
| CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
|
327
327
|
| Fused Linear CrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
|
328
|
+
| Multi Token Attention | `liger_kernel.transformers.LigerMultiTokenAttention` |
|
329
|
+
| Softmax | `liger_kernel.transformers.LigerSoftmax` |
|
328
330
|
| Sparsemax | `liger_kernel.transformers.LigerSparsemax` |
|
329
331
|
|
330
332
|
|
@@ -277,6 +277,8 @@ loss.backward()
|
|
277
277
|
| GeGLU | `liger_kernel.transformers.LigerGEGLUMLP` |
|
278
278
|
| CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
|
279
279
|
| Fused Linear CrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
|
280
|
+
| Multi Token Attention | `liger_kernel.transformers.LigerMultiTokenAttention` |
|
281
|
+
| Softmax | `liger_kernel.transformers.LigerSoftmax` |
|
280
282
|
| Sparsemax | `liger_kernel.transformers.LigerSparsemax` |
|
281
283
|
|
282
284
|
|
@@ -8,7 +8,7 @@ from utils import _test_memory
|
|
8
8
|
from utils import parse_benchmark_script_args
|
9
9
|
from utils import run_benchmarks
|
10
10
|
|
11
|
-
from liger_kernel.transformers.softmax import
|
11
|
+
from liger_kernel.transformers.softmax import LigerSoftmax
|
12
12
|
from liger_kernel.utils import infer_device
|
13
13
|
|
14
14
|
device = infer_device()
|
@@ -23,7 +23,7 @@ def bench_speed_softmax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOut
|
|
23
23
|
dtype = extra_benchmark_config["dtype"]
|
24
24
|
|
25
25
|
x_shape = (M, N)
|
26
|
-
liger_softmax =
|
26
|
+
liger_softmax = LigerSoftmax().to(device).to(dtype)
|
27
27
|
torch_softmax = torch.nn.Softmax(dim=-1).to(device).to(dtype)
|
28
28
|
|
29
29
|
x = torch.randn(x_shape, dtype=dtype, device=device)
|
@@ -72,7 +72,7 @@ def bench_memory_softmax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
|
|
72
72
|
dtype = extra_benchmark_config.get("dtype", torch.float32)
|
73
73
|
|
74
74
|
torch_softmax = torch.nn.Softmax(dim=-1)
|
75
|
-
liger_softmax =
|
75
|
+
liger_softmax = LigerSoftmax().to(device).to(dtype)
|
76
76
|
|
77
77
|
x = torch.randn(shape, device=device, dtype=dtype, requires_grad=True)
|
78
78
|
|
@@ -9,6 +9,8 @@
|
|
9
9
|
| GeGLU | `liger_kernel.transformers.LigerGEGLUMLP` |
|
10
10
|
| CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
|
11
11
|
| Fused Linear CrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
|
12
|
+
| Multi Token Attention | `liger_kernel.transformers.LigerMultiTokenAttention` |
|
13
|
+
| Softmax | `liger_kernel.transformers.LigerSoftmax` |
|
12
14
|
| Sparsemax | `liger_kernel.transformers.LigerSparsemax` |
|
13
15
|
|
14
16
|
|
@@ -51,6 +53,19 @@ This kernel combines linear transformations with cross-entropy loss calculations
|
|
51
53
|
!!! Example "Try it out"
|
52
54
|
You can experiment as shown in this example [here](https://colab.research.google.com/drive/1Z2QtvaIiLm5MWOs7X6ZPS1MN3hcIJFbj?usp=sharing)
|
53
55
|
|
56
|
+
### Multi Token Attention
|
57
|
+
|
58
|
+
The Multi Token Attention kernel implementation provides an optimized fused implementation of multi-token attention over the implemented PyTorch model baseline. Multi-token attention is a new attention mechanism, introduced by Meta Research, that can operate on multiple Q and K inputs.
|
59
|
+
|
60
|
+
Paper: https://arxiv.org/abs/2504.00927
|
61
|
+
|
62
|
+
### Softmax
|
63
|
+
|
64
|
+
The Softmax kernel implementation provides an optimized implementation of the softmax operation, which is a fundamental component in neural networks for converting raw scores into probability distributions.
|
65
|
+
|
66
|
+
The implementation shows notable speedups compared to the PyTorch Softmax implementation.
|
67
|
+
|
68
|
+
|
54
69
|
### Sparsemax
|
55
70
|
|
56
71
|
Sparsemax is a sparse alternative to softmax that produces sparse probability distributions. This kernel implements an efficient version of the sparsemax operation that can be used as a drop-in replacement for softmax in attention mechanisms or classification tasks.
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "liger_kernel_nightly"
|
7
|
-
version = "0.5.10.dev20250528223223"
|
7
|
+
version = "0.5.10.dev20250531184114"
|
8
8
|
description = "Efficient Triton kernels for LLM Training"
|
9
9
|
urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
@@ -193,6 +193,153 @@ def _rms_norm_backward_kernel(
|
|
193
193
|
|
194
194
|
tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
|
195
195
|
|
196
|
+
@triton.jit
|
197
|
+
def _block_rms_norm_forward_kernel(
|
198
|
+
Y_ptr,
|
199
|
+
Y_row_stride,
|
200
|
+
X_ptr,
|
201
|
+
X_row_stride,
|
202
|
+
W_ptr,
|
203
|
+
W_row_stride,
|
204
|
+
RSTD_ptr,
|
205
|
+
RSTD_row_stride,
|
206
|
+
n_rows,
|
207
|
+
n_cols,
|
208
|
+
eps,
|
209
|
+
offset,
|
210
|
+
casting_mode: tl.constexpr, # constexpr so the `if` blocks can be optimized out
|
211
|
+
BLOCK_SIZE: tl.constexpr,
|
212
|
+
BLOCK_ROW: tl.constexpr,
|
213
|
+
):
|
214
|
+
"""
|
215
|
+
y_i = (x_i / RMS) * (offset + w_i), RMS = sqrt(sum(x_i^2) / N)
|
216
|
+
|
217
|
+
Reference:
|
218
|
+
1. https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
|
219
|
+
2. https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/rms_layernorm.py#L22
|
220
|
+
3. https://arxiv.org/pdf/1910.07467
|
221
|
+
"""
|
222
|
+
|
223
|
+
row_idx = tl.program_id(0) * BLOCK_ROW + tl.arange(0, BLOCK_ROW)
|
224
|
+
col_offsets = tl.arange(0, BLOCK_SIZE)
|
225
|
+
row_mask = row_idx < n_rows
|
226
|
+
col_mask = col_offsets < n_cols
|
227
|
+
|
228
|
+
|
229
|
+
X_row = tl.load(X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :], mask=row_mask[:, None] & col_mask[None, :] , other=0)
|
230
|
+
X_row_dtype = X_row.dtype
|
231
|
+
W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0)
|
232
|
+
|
233
|
+
# On Llama, only rstd is computed on fp32
|
234
|
+
if casting_mode == _CASTING_MODE_LLAMA:
|
235
|
+
X_row = X_row.to(tl.float32)
|
236
|
+
|
237
|
+
# Gemma computes everything on fp32, and then casts back the output to the original dtype
|
238
|
+
if casting_mode == _CASTING_MODE_GEMMA:
|
239
|
+
W_row = W_row.to(tl.float32)
|
240
|
+
X_row = X_row.to(tl.float32)
|
241
|
+
|
242
|
+
if casting_mode == _CASTING_MODE_NONE:
|
243
|
+
eps = eps.to(X_row_dtype)
|
244
|
+
offset = offset.to(X_row_dtype)
|
245
|
+
|
246
|
+
mean_square = tl.sum(X_row * X_row, axis=1) / n_cols
|
247
|
+
rstd = rsqrt(mean_square + eps)
|
248
|
+
|
249
|
+
# We can save time by caching rms with minimal memory overhead
|
250
|
+
# because rms is much smaller compared to X_row, as rms is for each row.
|
251
|
+
# However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
|
252
|
+
tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd, row_mask)
|
253
|
+
|
254
|
+
X_row = X_row * rstd[:, None]
|
255
|
+
|
256
|
+
# On Llama, the multiplication with the weight is done on the original dtype
|
257
|
+
if casting_mode == _CASTING_MODE_LLAMA:
|
258
|
+
X_row = X_row.to(X_row_dtype)
|
259
|
+
|
260
|
+
Y_row = X_row * (offset + W_row)[None, :]
|
261
|
+
|
262
|
+
if casting_mode == _CASTING_MODE_GEMMA:
|
263
|
+
Y_row = Y_row.to(X_row_dtype)
|
264
|
+
|
265
|
+
tl.store(Y_ptr + row_idx[:, None] * Y_row_stride + col_offsets[None, :], Y_row, mask=row_mask[:, None] & col_mask[None, :])
|
266
|
+
|
267
|
+
@triton.jit
|
268
|
+
def _block_rms_norm_backward_kernel(
|
269
|
+
dY_ptr,
|
270
|
+
dY_row_stride,
|
271
|
+
dX_ptr,
|
272
|
+
dX_row_stride,
|
273
|
+
X_ptr,
|
274
|
+
X_row_stride,
|
275
|
+
X_dtype: tl.constexpr,
|
276
|
+
W_ptr,
|
277
|
+
W_row_stride,
|
278
|
+
RSTD_ptr,
|
279
|
+
RSTD_row_stride,
|
280
|
+
dW_ptr,
|
281
|
+
dW_row_stride,
|
282
|
+
n_rows,
|
283
|
+
n_cols,
|
284
|
+
offset,
|
285
|
+
rows_per_program: tl.constexpr,
|
286
|
+
casting_mode: tl.constexpr,
|
287
|
+
BLOCK_SIZE: tl.constexpr,
|
288
|
+
BLOCK_ROW: tl.constexpr,
|
289
|
+
):
|
290
|
+
"""
|
291
|
+
dx = (1 / RMS) * [dy * (w + offset) - (1 / N) * (1 / RMS^2) * ((dy * (w + offset)) dot x) * x]. Here * means element-wise multiplication, whereas dot means dot product
|
292
|
+
dw = sum(dy * (x / RMS)). summation over BxT dimension
|
293
|
+
"""
|
294
|
+
|
295
|
+
pid = tl.program_id(0).cast(tl.int64)
|
296
|
+
NUM_SMS = tl.num_programs(0)
|
297
|
+
|
298
|
+
col_offsets = tl.arange(0, BLOCK_SIZE)
|
299
|
+
col_mask = col_offsets < n_cols
|
300
|
+
|
301
|
+
dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
|
302
|
+
|
303
|
+
W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0)
|
304
|
+
W_row = W_row + offset
|
305
|
+
|
306
|
+
for start in range(pid * BLOCK_ROW, n_rows, NUM_SMS * BLOCK_ROW):
|
307
|
+
row_idx = start + tl.arange(0, BLOCK_ROW)
|
308
|
+
row_mask = row_idx < n_rows
|
309
|
+
dY_row = tl.load(dY_ptr + row_idx[:, None] * dY_row_stride + col_offsets[None, :], mask=row_mask[:, None] & col_mask[None, :], other=0.0)
|
310
|
+
X_row = tl.load(X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :], mask=row_mask[:, None] & col_mask[None, :], other=0.0)
|
311
|
+
|
312
|
+
# Get cached rms
|
313
|
+
rstd_row = tl.load(RSTD_ptr + row_idx * RSTD_row_stride, row_mask)
|
314
|
+
|
315
|
+
X_row = X_row.to(tl.float32)
|
316
|
+
|
317
|
+
# Different backward graphs for different casting modes
|
318
|
+
if casting_mode == _CASTING_MODE_LLAMA:
|
319
|
+
m = (dY_row * W_row[None, :]).to(tl.float32)
|
320
|
+
|
321
|
+
elif casting_mode == _CASTING_MODE_GEMMA:
|
322
|
+
dY_row = dY_row.to(tl.float32)
|
323
|
+
m = dY_row * W_row[None, :]
|
324
|
+
else:
|
325
|
+
m = dY_row * W_row[None, :]
|
326
|
+
|
327
|
+
dX_row = rstd_row[:, None] * m
|
328
|
+
|
329
|
+
dX_row += (rstd_row[:, None]) * (-(1 / n_cols) * (rstd_row * rstd_row * tl.sum(m * X_row, axis=1))[:, None] * X_row)
|
330
|
+
|
331
|
+
# calculate the gradient of W
|
332
|
+
if casting_mode == _CASTING_MODE_LLAMA:
|
333
|
+
dW_row += tl.sum(dY_row * (X_row * rstd_row[:, None]).to(X_dtype), 0)
|
334
|
+
else:
|
335
|
+
# here X_row is already in fp32 (see previous if block)
|
336
|
+
dW_row += tl.sum(dY_row * (X_row * rstd_row[:, None]), 0)
|
337
|
+
|
338
|
+
tl.store(dX_ptr + row_idx[:, None] * dX_row_stride + col_offsets[None, :], dX_row, mask=row_mask[:, None] & col_mask[None, :])
|
339
|
+
|
340
|
+
|
341
|
+
tl.store(dW_ptr + pid * dW_row_stride + col_offsets, dW_row, mask=col_mask)
|
342
|
+
|
196
343
|
|
197
344
|
_str_to_casting_mode = {
|
198
345
|
"llama": _CASTING_MODE_LLAMA.value,
|
@@ -201,7 +348,7 @@ _str_to_casting_mode = {
|
|
201
348
|
}
|
202
349
|
|
203
350
|
|
204
|
-
def rms_norm_forward(X, W, eps, offset, casting_mode):
|
351
|
+
def rms_norm_forward(X, W, eps, offset, casting_mode, row_mode):
|
205
352
|
if not isinstance(casting_mode, int):
|
206
353
|
assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
|
207
354
|
casting_mode = _str_to_casting_mode[casting_mode]
|
@@ -227,27 +374,49 @@ def rms_norm_forward(X, W, eps, offset, casting_mode):
|
|
227
374
|
kernel_args = {}
|
228
375
|
if X.device.type == "xpu":
|
229
376
|
kernel_args["grf_mode"] = "large"
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
377
|
+
if BLOCK_SIZE > 256 or n_rows < 4096 * 8 or row_mode:
|
378
|
+
_rms_norm_forward_kernel[(n_rows,)](
|
379
|
+
Y,
|
380
|
+
Y.stride(0),
|
381
|
+
X,
|
382
|
+
X.stride(0),
|
383
|
+
W,
|
384
|
+
W.stride(0),
|
385
|
+
RSTD,
|
386
|
+
RSTD.stride(0),
|
387
|
+
n_cols,
|
388
|
+
eps,
|
389
|
+
offset,
|
390
|
+
casting_mode,
|
391
|
+
BLOCK_SIZE=BLOCK_SIZE,
|
392
|
+
num_warps=num_warps,
|
393
|
+
**kernel_args, # XPU-specific optimization
|
394
|
+
)
|
395
|
+
else:
|
396
|
+
BLOCK_ROW = 16
|
397
|
+
kernel_args["BLOCK_ROW"] = BLOCK_ROW
|
398
|
+
_block_rms_norm_forward_kernel[(triton.cdiv(n_rows, BLOCK_ROW),)](
|
399
|
+
Y,
|
400
|
+
Y.stride(0),
|
401
|
+
X,
|
402
|
+
X.stride(0),
|
403
|
+
W,
|
404
|
+
W.stride(0),
|
405
|
+
RSTD,
|
406
|
+
RSTD.stride(0),
|
407
|
+
n_rows,
|
408
|
+
n_cols,
|
409
|
+
eps,
|
410
|
+
offset,
|
411
|
+
casting_mode,
|
412
|
+
BLOCK_SIZE=BLOCK_SIZE,
|
413
|
+
num_warps=num_warps,
|
414
|
+
**kernel_args, # XPU-specific optimization
|
415
|
+
)
|
247
416
|
return Y.view(*shape), X, RSTD, BLOCK_SIZE, num_warps, casting_mode
|
248
417
|
|
249
418
|
|
250
|
-
def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
|
419
|
+
def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place, row_mode):
|
251
420
|
shape = dY.shape
|
252
421
|
dim = shape[-1]
|
253
422
|
dY = dY.view(-1, dim)
|
@@ -277,29 +446,56 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
|
|
277
446
|
if X.device.type == "xpu":
|
278
447
|
kernel_args["grf_mode"] = "large"
|
279
448
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
449
|
+
if BLOCK_SIZE > 256 or n_rows < 4096 * 8 or row_mode:
|
450
|
+
_rms_norm_backward_kernel[grid](
|
451
|
+
dY,
|
452
|
+
dY.stride(0),
|
453
|
+
dX,
|
454
|
+
dX.stride(0),
|
455
|
+
X,
|
456
|
+
X.stride(0),
|
457
|
+
torch_to_triton_dtype[X.dtype],
|
458
|
+
W,
|
459
|
+
W.stride(0),
|
460
|
+
RSTD,
|
461
|
+
RSTD.stride(0),
|
462
|
+
_dW,
|
463
|
+
_dW.stride(0),
|
464
|
+
n_rows,
|
465
|
+
n_cols,
|
466
|
+
offset,
|
467
|
+
rows_per_program,
|
468
|
+
casting_mode,
|
469
|
+
BLOCK_SIZE=BLOCK_SIZE,
|
470
|
+
num_warps=num_warps,
|
471
|
+
**kernel_args, # XPU-specific optimization
|
472
|
+
)
|
473
|
+
else:
|
474
|
+
BLOCK_ROW = 16
|
475
|
+
kernel_args["BLOCK_ROW"] = BLOCK_ROW
|
476
|
+
_block_rms_norm_backward_kernel[grid](
|
477
|
+
dY,
|
478
|
+
dY.stride(0),
|
479
|
+
dX,
|
480
|
+
dX.stride(0),
|
481
|
+
X,
|
482
|
+
X.stride(0),
|
483
|
+
torch_to_triton_dtype[X.dtype],
|
484
|
+
W,
|
485
|
+
W.stride(0),
|
486
|
+
RSTD,
|
487
|
+
RSTD.stride(0),
|
488
|
+
_dW,
|
489
|
+
_dW.stride(0),
|
490
|
+
n_rows,
|
491
|
+
n_cols,
|
492
|
+
offset,
|
493
|
+
rows_per_program,
|
494
|
+
casting_mode,
|
495
|
+
BLOCK_SIZE=BLOCK_SIZE,
|
496
|
+
num_warps=num_warps,
|
497
|
+
**kernel_args, # XPU-specific optimization
|
498
|
+
)
|
303
499
|
dX = dX.view(*shape)
|
304
500
|
dW = _dW.sum(dim=0).to(W.dtype)
|
305
501
|
|
@@ -330,15 +526,16 @@ class LigerRMSNormFunction(torch.autograd.Function):
|
|
330
526
|
|
331
527
|
@staticmethod
|
332
528
|
@ensure_contiguous
|
333
|
-
def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama", in_place=True):
|
529
|
+
def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama", in_place=True, row_mode=None):
|
334
530
|
"""
|
335
531
|
X: (B, T, H) or (BxT, H)
|
336
532
|
W: (H,)
|
337
533
|
"""
|
338
|
-
Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode)
|
534
|
+
Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode, row_mode)
|
339
535
|
ctx.offset = offset
|
340
536
|
ctx.casting_mode = casting_mode
|
341
537
|
ctx.in_place = in_place
|
538
|
+
ctx.row_mode = row_mode
|
342
539
|
ctx.BLOCK_SIZE = BLOCK_SIZE
|
343
540
|
ctx.num_warps = num_warps
|
344
541
|
ctx.save_for_backward(X, W, RSTD)
|
@@ -361,5 +558,6 @@ class LigerRMSNormFunction(torch.autograd.Function):
|
|
361
558
|
ctx.BLOCK_SIZE,
|
362
559
|
ctx.num_warps,
|
363
560
|
ctx.in_place,
|
561
|
+
ctx.row_mode
|
364
562
|
)
|
365
|
-
return dX, dW, None, None, None, None
|
563
|
+
return dX, dW, None, None, None, None, None
|
@@ -13,6 +13,7 @@ class LigerRMSNorm(nn.Module):
|
|
13
13
|
casting_mode="llama",
|
14
14
|
init_fn="ones",
|
15
15
|
in_place=True,
|
16
|
+
row_mode=None,
|
16
17
|
):
|
17
18
|
super().__init__()
|
18
19
|
assert init_fn in [
|
@@ -20,11 +21,12 @@ class LigerRMSNorm(nn.Module):
|
|
20
21
|
"zeros",
|
21
22
|
], f"init_fn must be either 'ones' or 'zeros', got {init_fn}"
|
22
23
|
self.weight = nn.Parameter(torch.ones(hidden_size) if init_fn == "ones" else torch.zeros(hidden_size))
|
23
|
-
self.variance_epsilon, self.offset, self.casting_mode, self.in_place = (
|
24
|
+
self.variance_epsilon, self.offset, self.casting_mode, self.in_place, self.row_mode = (
|
24
25
|
eps,
|
25
26
|
offset,
|
26
27
|
casting_mode,
|
27
28
|
in_place,
|
29
|
+
row_mode,
|
28
30
|
)
|
29
31
|
|
30
32
|
def forward(self, hidden_states):
|
@@ -35,6 +37,7 @@ class LigerRMSNorm(nn.Module):
|
|
35
37
|
self.offset,
|
36
38
|
self.casting_mode,
|
37
39
|
self.in_place,
|
40
|
+
self.row_mode
|
38
41
|
)
|
39
42
|
|
40
43
|
def extra_repr(self):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: liger_kernel_nightly
|
3
|
-
Version: 0.5.10.dev20250528223223
|
3
|
+
Version: 0.5.10.dev20250531184114
|
4
4
|
Summary: Efficient Triton kernels for LLM Training
|
5
5
|
License: BSD 2-CLAUSE LICENSE
|
6
6
|
Copyright 2024 LinkedIn Corporation
|
@@ -325,6 +325,8 @@ loss.backward()
|
|
325
325
|
| GeGLU | `liger_kernel.transformers.LigerGEGLUMLP` |
|
326
326
|
| CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
|
327
327
|
| Fused Linear CrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
|
328
|
+
| Multi Token Attention | `liger_kernel.transformers.LigerMultiTokenAttention` |
|
329
|
+
| Softmax | `liger_kernel.transformers.LigerSoftmax` |
|
328
330
|
| Sparsemax | `liger_kernel.transformers.LigerSparsemax` |
|
329
331
|
|
330
332
|
|
@@ -6,7 +6,7 @@ from test.utils import set_seed
|
|
6
6
|
from test.utils import supports_bfloat16
|
7
7
|
|
8
8
|
from liger_kernel.transformers.functional import liger_softmax
|
9
|
-
from liger_kernel.transformers.softmax import
|
9
|
+
from liger_kernel.transformers.softmax import LigerSoftmax
|
10
10
|
from liger_kernel.utils import infer_device
|
11
11
|
|
12
12
|
device = infer_device()
|
@@ -47,7 +47,7 @@ def test_liger_softmax(shape, dtype, atol, rtol):
|
|
47
47
|
|
48
48
|
torch_softmax = torch.nn.Softmax(dim=-1)
|
49
49
|
ref_out = torch_softmax(x1)
|
50
|
-
liger_softmax =
|
50
|
+
liger_softmax = LigerSoftmax().to(device).to(dtype)
|
51
51
|
liger_out = liger_softmax(x2)
|
52
52
|
|
53
53
|
assert_verbose_allclose(ref_out, liger_out, atol=atol, rtol=rtol)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|