liger-kernel-nightly 0.6.2.dev20251011152316__tar.gz → 0.6.2.dev20251011154427__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_cross_entropy.py +4 -1
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +25 -19
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/cross_entropy.py +55 -52
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/fused_linear_cross_entropy.py +3 -2
- liger_kernel_nightly-0.6.2.dev20251011154427/src/liger_kernel/transformers/model/falcon_h1.py +108 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/monkey_patch.py +8 -4
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel_nightly.egg-info/SOURCES.txt +1 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/bf16/test_mini_models.py +60 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/bf16/test_mini_models_with_logits.py +59 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/fp32/test_mini_models.py +56 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/fp32/test_mini_models_with_logits.py +56 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_cross_entropy.py +45 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_fused_linear_cross_entropy.py +113 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_monkey_patch.py +51 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/utils.py +12 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/workflows/benchmark.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/.gitignore +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/Makefile +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/data/all_benchmark_data.csv +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/dev/modal/benchmarks.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/index.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/docs/license.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/setup.cfg +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/setup.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/gemma3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/glm4.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/glm4v.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/glm4v_moe.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/internvl.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/llama4.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/llava.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/olmo2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/paligemma.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/qwen3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/model/smollm3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/test_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251011152316 → liger_kernel_nightly-0.6.2.dev20251011154427}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -70,6 +70,9 @@ def bench_speed_cross_entropy(
|
|
70
70
|
|
71
71
|
if mode == "forward":
|
72
72
|
ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES)
|
73
|
+
elif mode == "no-grad-forward":
|
74
|
+
with torch.no_grad():
|
75
|
+
ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES)
|
73
76
|
elif mode == "backward":
|
74
77
|
y = fwd()
|
75
78
|
|
@@ -109,7 +112,7 @@ if __name__ == "__main__":
|
|
109
112
|
|
110
113
|
run_benchmarks(
|
111
114
|
bench_test_fn=bench_speed_cross_entropy,
|
112
|
-
kernel_operation_modes=["forward", "backward", "full"],
|
115
|
+
kernel_operation_modes=["forward", "backward", "full", "no-grad-forward"],
|
113
116
|
metric_name="speed",
|
114
117
|
metric_unit="ms",
|
115
118
|
**common_configs,
|
@@ -59,26 +59,26 @@ def bench_memory_fused_linear_cross_entropy(
|
|
59
59
|
dtype = input.extra_benchmark_config["dtype"]
|
60
60
|
provider = input.kernel_provider
|
61
61
|
|
62
|
-
|
63
|
-
|
64
|
-
|
62
|
+
lm_head_ce = None
|
63
|
+
if provider == "liger":
|
64
|
+
lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
|
65
|
+
elif provider == "liger-fp32-accum":
|
66
|
+
lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
|
67
|
+
else:
|
68
|
+
lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
|
65
69
|
|
66
70
|
_input = torch.randn(BT, H, requires_grad=True, dtype=dtype, device=device)
|
67
71
|
target = torch.randint(V, (BT, 1), dtype=torch.long, device=device).squeeze(1)
|
68
72
|
|
69
73
|
def fwd():
|
70
|
-
|
71
|
-
return liger_lm_head_ce(_input, target)
|
72
|
-
elif provider == "liger-fp32-accum":
|
73
|
-
return liger_lm_head_ce_fp32_accum(_input, target)
|
74
|
-
elif provider == "huggingface":
|
75
|
-
return torch_lm_head_ce(_input, target)
|
74
|
+
return lm_head_ce(_input, target)
|
76
75
|
|
77
76
|
def full():
|
78
77
|
y = fwd()
|
79
78
|
y.backward()
|
80
79
|
|
81
80
|
mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES)
|
81
|
+
|
82
82
|
return SingleBenchmarkRunOutput(
|
83
83
|
y_20=mem_20,
|
84
84
|
y_50=mem_50,
|
@@ -101,20 +101,19 @@ def bench_speed_fused_linear_cross_entropy(
|
|
101
101
|
provider = input.kernel_provider
|
102
102
|
mode = input.kernel_operation_mode
|
103
103
|
|
104
|
-
|
105
|
-
|
106
|
-
|
104
|
+
lm_head_ce = None
|
105
|
+
if provider == "liger":
|
106
|
+
lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
|
107
|
+
elif provider == "liger-fp32-accum":
|
108
|
+
lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
|
109
|
+
else:
|
110
|
+
lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
|
107
111
|
|
108
112
|
_input = torch.randn(BT, H, requires_grad=True, dtype=dtype, device=device)
|
109
113
|
target = torch.randint(V, (BT, 1), dtype=torch.long, device=device).squeeze(1)
|
110
114
|
|
111
115
|
def fwd():
|
112
|
-
|
113
|
-
return liger_lm_head_ce(_input, target)
|
114
|
-
elif provider == "liger-fp32-accum":
|
115
|
-
return liger_lm_head_ce_fp32_accum(_input, target)
|
116
|
-
elif provider == "huggingface":
|
117
|
-
return torch_lm_head_ce(_input, target)
|
116
|
+
return lm_head_ce(_input, target)
|
118
117
|
|
119
118
|
if mode == "forward":
|
120
119
|
ms_50, ms_20, ms_80 = triton.testing.do_bench(
|
@@ -122,6 +121,13 @@ def bench_speed_fused_linear_cross_entropy(
|
|
122
121
|
rep=100,
|
123
122
|
quantiles=QUANTILES,
|
124
123
|
)
|
124
|
+
elif mode == "no-grad-forward":
|
125
|
+
with torch.no_grad():
|
126
|
+
ms_50, ms_20, ms_80 = triton.testing.do_bench(
|
127
|
+
fwd,
|
128
|
+
rep=100,
|
129
|
+
quantiles=QUANTILES,
|
130
|
+
)
|
125
131
|
elif mode == "backward":
|
126
132
|
y = fwd()
|
127
133
|
|
@@ -164,7 +170,7 @@ if __name__ == "__main__":
|
|
164
170
|
|
165
171
|
run_benchmarks(
|
166
172
|
bench_test_fn=bench_speed_fused_linear_cross_entropy,
|
167
|
-
kernel_operation_modes=["forward", "backward", "full"],
|
173
|
+
kernel_operation_modes=["forward", "backward", "full", "no-grad-forward"],
|
168
174
|
metric_name="speed",
|
169
175
|
metric_unit="ms",
|
170
176
|
**common_configs,
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "liger_kernel_nightly"
|
7
|
-
version = "0.6.2.dev20251011152316"
|
7
|
+
version = "0.6.2.dev20251011154427"
|
8
8
|
description = "Efficient Triton kernels for LLM Training"
|
9
9
|
urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
@@ -45,6 +45,7 @@ def liger_cross_entropy_kernel(
|
|
45
45
|
BLOCK_SIZE: tl.constexpr,
|
46
46
|
HAS_WEIGHT: tl.constexpr,
|
47
47
|
HAS_SOFTCAPPING: tl.constexpr,
|
48
|
+
HAS_GRADIENTS: tl.constexpr,
|
48
49
|
):
|
49
50
|
"""
|
50
51
|
This kernel computes both cross entropy loss and the gradient of the input.
|
@@ -72,6 +73,7 @@ def liger_cross_entropy_kernel(
|
|
72
73
|
BLOCK_SIZE (int): The block size for Triton operations.
|
73
74
|
HAS_WEIGHT (bool): The boolean value to determine whether assigning weight to each of the classes.
|
74
75
|
HAS_SOFTCAPPING (bool): The boolean value to determine whether applying soft-capping or not.
|
76
|
+
HAS_GRADIENTS (bool): The boolean value to determine whether calculating gradients in forward pass.
|
75
77
|
"""
|
76
78
|
|
77
79
|
# https://github.com/triton-lang/triton/issues/1058
|
@@ -155,58 +157,58 @@ def liger_cross_entropy_kernel(
|
|
155
157
|
# For 'sum' reduction, no normalization is applied:
|
156
158
|
# dx_y = softmax(x_y) - 1
|
157
159
|
# dx_i = softmax(x_i), for i ≠ y
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
160
|
+
if HAS_GRADIENTS:
|
161
|
+
for i in range(0, n_cols, BLOCK_SIZE):
|
162
|
+
X_offsets = i + tl.arange(0, BLOCK_SIZE)
|
163
|
+
X_block = tl.load(
|
164
|
+
X_ptr + X_offsets,
|
165
|
+
mask=X_offsets < n_cols,
|
166
|
+
other=float("-inf"),
|
167
|
+
# Ensure float32 precision for softmax calculation
|
168
|
+
).cast(tl.float32)
|
169
|
+
if HAS_SOFTCAPPING:
|
170
|
+
intermediate = tanh(X_block / softcap)
|
171
|
+
X_block = softcap * intermediate
|
172
|
+
|
173
|
+
if not HAS_WEIGHT:
|
174
|
+
# softmax(x_i)
|
175
|
+
X_block = tl.exp(X_block - m) / d
|
176
|
+
# derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
|
177
|
+
X_block += 2 * lse_square_scale * lse * X_block
|
178
|
+
# smoothing term
|
179
|
+
X_block += -eps
|
180
|
+
# special handle dx_y
|
181
|
+
X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
|
182
|
+
# reduction scale
|
183
|
+
if reduction == "mean":
|
184
|
+
X_block = X_block / n_non_ignore
|
185
|
+
else:
|
186
|
+
weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
|
187
|
+
softmax_X = tl.exp(X_block - m) / d
|
188
|
+
# derivative of original_loss
|
189
|
+
dloss_ori = (1 - label_smoothing) * softmax_X
|
190
|
+
# specially handle dx_y
|
191
|
+
dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing))
|
192
|
+
dloss_ori = dloss_ori * weight_y
|
193
|
+
# derivative of smooth_loss
|
194
|
+
dloss_smooth = eps * (-weight_block + softmax_X * weight_sum)
|
195
|
+
# derivative of z-loss
|
196
|
+
dz_loss = 2 * lse_square_scale * lse * softmax_X
|
197
|
+
# reduction scale
|
198
|
+
if reduction == "mean":
|
199
|
+
dloss_ori = dloss_ori / sum_non_ignore_weight
|
200
|
+
dloss_smooth = dloss_smooth / sum_non_ignore_weight
|
201
|
+
# TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
|
202
|
+
dz_loss = dz_loss / n_non_ignore
|
203
|
+
# derivative of total_loss
|
204
|
+
X_block = dloss_ori + dloss_smooth + dz_loss
|
205
|
+
|
206
|
+
# chain rule softcapping
|
207
|
+
# d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap))
|
208
|
+
if HAS_SOFTCAPPING:
|
209
|
+
X_block = X_block * (1 - intermediate * intermediate)
|
210
|
+
|
211
|
+
tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)
|
210
212
|
|
211
213
|
# We need tl.debug_barrier() to ensure the new result of X_ptr is written as mentioned in
|
212
214
|
# https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
|
@@ -332,6 +334,7 @@ def cross_entropy_forward(
|
|
332
334
|
BLOCK_SIZE=BLOCK_SIZE,
|
333
335
|
HAS_WEIGHT=True if weight is not None else False,
|
334
336
|
HAS_SOFTCAPPING=True if softcap is not None else False,
|
337
|
+
HAS_GRADIENTS=_input.requires_grad,
|
335
338
|
# TODO: 32 seems to give the best performance
|
336
339
|
# Performance is quite sensitive to num_warps
|
337
340
|
num_warps=32 if not is_hip() else 16,
|
@@ -150,6 +150,7 @@ def fused_linear_cross_entropy_forward(
|
|
150
150
|
RETURN_Z_LOSS=return_z_loss,
|
151
151
|
HAS_WEIGHT=True if ce_weight is not None else False,
|
152
152
|
HAS_SOFTCAPPING=True if softcap is not None else False,
|
153
|
+
HAS_GRADIENTS=_input.requires_grad,
|
153
154
|
BLOCK_SIZE=BLOCK_SIZE,
|
154
155
|
num_warps=32 if not is_hip() else 16,
|
155
156
|
)
|
@@ -173,10 +174,10 @@ def fused_linear_cross_entropy_forward(
|
|
173
174
|
|
174
175
|
grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
|
175
176
|
|
176
|
-
if grad_weight is not None:
|
177
|
+
if grad_weight is not None and _input.requires_grad:
|
177
178
|
grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
|
178
179
|
|
179
|
-
if bias is not None:
|
180
|
+
if bias is not None and _input.requires_grad:
|
180
181
|
torch.add(
|
181
182
|
input=grad_bias,
|
182
183
|
other=grad_logits_chunk.sum(dim=0),
|
@@ -0,0 +1,108 @@
|
|
1
|
+
from typing import TYPE_CHECKING
|
2
|
+
from typing import Optional
|
3
|
+
from typing import Union
|
4
|
+
|
5
|
+
import torch
|
6
|
+
|
7
|
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from transformers.models.falcon_h1.modeling_falcon_h1 import FalconHybridMambaAttentionDynamicCache
|
11
|
+
|
12
|
+
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
|
13
|
+
|
14
|
+
|
15
|
+
def lce_forward(
|
16
|
+
self,
|
17
|
+
input_ids: torch.LongTensor = None,
|
18
|
+
attention_mask: Optional[torch.Tensor] = None,
|
19
|
+
position_ids: Optional[torch.LongTensor] = None,
|
20
|
+
past_key_values: Optional["FalconHybridMambaAttentionDynamicCache"] = None,
|
21
|
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
22
|
+
labels: Optional[torch.LongTensor] = None,
|
23
|
+
use_cache: Optional[bool] = None,
|
24
|
+
output_attentions: Optional[bool] = None,
|
25
|
+
output_hidden_states: Optional[bool] = None,
|
26
|
+
cache_position: Optional[torch.LongTensor] = None,
|
27
|
+
logits_to_keep: Union[int, torch.Tensor] = 0,
|
28
|
+
skip_logits: Optional[bool] = None,
|
29
|
+
**kwargs,
|
30
|
+
) -> Union[tuple, CausalLMOutputWithPast]:
|
31
|
+
r"""
|
32
|
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
33
|
+
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
34
|
+
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
35
|
+
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
36
|
+
|
37
|
+
Example:
|
38
|
+
|
39
|
+
```python
|
40
|
+
>>> from transformers import AutoTokenizer, FalconH1ForCausalLM
|
41
|
+
|
42
|
+
>>> model = FalconH1ForCausalLM.from_pretrained("...")
|
43
|
+
>>> tokenizer = AutoTokenizer.from_pretrained("...")
|
44
|
+
|
45
|
+
>>> prompt = "Hey, are you conscious? Can you talk to me?"
|
46
|
+
>>> inputs = tokenizer(prompt, return_tensors="pt")
|
47
|
+
|
48
|
+
>>> # Generate
|
49
|
+
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
50
|
+
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
51
|
+
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
|
52
|
+
```"""
|
53
|
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
54
|
+
output_hidden_states = (
|
55
|
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
56
|
+
)
|
57
|
+
|
58
|
+
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
59
|
+
outputs = self.model(
|
60
|
+
input_ids=input_ids,
|
61
|
+
attention_mask=attention_mask,
|
62
|
+
position_ids=position_ids,
|
63
|
+
past_key_values=past_key_values,
|
64
|
+
inputs_embeds=inputs_embeds,
|
65
|
+
use_cache=use_cache,
|
66
|
+
output_attentions=output_attentions,
|
67
|
+
output_hidden_states=output_hidden_states,
|
68
|
+
cache_position=cache_position,
|
69
|
+
**kwargs,
|
70
|
+
)
|
71
|
+
|
72
|
+
hidden_states = outputs[0]
|
73
|
+
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
74
|
+
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
75
|
+
kept_hidden_states = hidden_states[:, slice_indices, :]
|
76
|
+
|
77
|
+
shift_labels = kwargs.pop("shift_labels", None)
|
78
|
+
logits = None
|
79
|
+
loss = None
|
80
|
+
# if in training mode, don't materialize logits
|
81
|
+
if skip_logits and labels is None:
|
82
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
83
|
+
|
84
|
+
if skip_logits is None:
|
85
|
+
# By default, if in training mode, don't materialize logits
|
86
|
+
skip_logits = self.training and labels is not None
|
87
|
+
|
88
|
+
if skip_logits:
|
89
|
+
loss = LigerForCausalLMLoss(
|
90
|
+
hidden_states=kept_hidden_states,
|
91
|
+
lm_head_weight=self.lm_head.weight,
|
92
|
+
labels=labels,
|
93
|
+
shift_labels=shift_labels,
|
94
|
+
hidden_size=self.config.hidden_size,
|
95
|
+
**kwargs,
|
96
|
+
)
|
97
|
+
else:
|
98
|
+
logits = self.lm_head(kept_hidden_states)
|
99
|
+
if labels is not None or shift_labels is not None:
|
100
|
+
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
|
101
|
+
|
102
|
+
return CausalLMOutputWithPast(
|
103
|
+
loss=loss,
|
104
|
+
logits=logits,
|
105
|
+
past_key_values=outputs.past_key_values,
|
106
|
+
hidden_states=outputs.hidden_states,
|
107
|
+
attentions=outputs.attentions,
|
108
|
+
)
|
@@ -15,6 +15,7 @@ from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
|
|
15
15
|
from liger_kernel.transformers.functional import liger_cross_entropy
|
16
16
|
from liger_kernel.transformers.geglu import LigerGEGLUMLP
|
17
17
|
from liger_kernel.transformers.layer_norm import LigerLayerNorm
|
18
|
+
from liger_kernel.transformers.model.falcon_h1 import lce_forward as falcon_h1_lce_forward
|
18
19
|
from liger_kernel.transformers.model.gemma import lce_forward as gemma_lce_forward
|
19
20
|
from liger_kernel.transformers.model.gemma import lce_forward_deprecated as gemma_lce_forward_deprecated
|
20
21
|
from liger_kernel.transformers.model.gemma2 import lce_forward as gemma2_lce_forward
|
@@ -2109,8 +2110,8 @@ def apply_liger_kernel_to_internvl(
|
|
2109
2110
|
|
2110
2111
|
def apply_liger_kernel_to_falcon_h1(
|
2111
2112
|
rope: bool = True,
|
2112
|
-
cross_entropy: bool =
|
2113
|
-
fused_linear_cross_entropy: bool =
|
2113
|
+
cross_entropy: bool = False,
|
2114
|
+
fused_linear_cross_entropy: bool = True,
|
2114
2115
|
rms_norm: bool = True,
|
2115
2116
|
swiglu: bool = False,
|
2116
2117
|
model: PreTrainedModel = None,
|
@@ -2144,7 +2145,7 @@ def apply_liger_kernel_to_falcon_h1(
|
|
2144
2145
|
logger.info("Apply liger RMSNorm")
|
2145
2146
|
modeling_falcon_h1.FalconH1RMSNorm = LigerRMSNorm
|
2146
2147
|
if swiglu:
|
2147
|
-
|
2148
|
+
logger.warning("LigerSwiGLUMLP is not available for Falcon-H1 models. There will be no effect.")
|
2148
2149
|
|
2149
2150
|
if cross_entropy:
|
2150
2151
|
logger.info("Apply liger cross entropy")
|
@@ -2153,7 +2154,10 @@ def apply_liger_kernel_to_falcon_h1(
|
|
2153
2154
|
nn.functional.cross_entropy = liger_cross_entropy
|
2154
2155
|
|
2155
2156
|
if fused_linear_cross_entropy:
|
2156
|
-
|
2157
|
+
if model is not None:
|
2158
|
+
model.forward = MethodType(falcon_h1_lce_forward, model)
|
2159
|
+
else:
|
2160
|
+
modeling_falcon_h1.FalconH1ForCausalLM.forward = falcon_h1_lce_forward
|
2157
2161
|
|
2158
2162
|
if model is not None:
|
2159
2163
|
# The model instance already exists, so we need to additionally patch the
|
@@ -187,6 +187,7 @@ src/liger_kernel/transformers/tvd.py
|
|
187
187
|
src/liger_kernel/transformers/experimental/__init__.py
|
188
188
|
src/liger_kernel/transformers/experimental/embedding.py
|
189
189
|
src/liger_kernel/transformers/model/__init__.py
|
190
|
+
src/liger_kernel/transformers/model/falcon_h1.py
|
190
191
|
src/liger_kernel/transformers/model/gemma.py
|
191
192
|
src/liger_kernel/transformers/model/gemma2.py
|
192
193
|
src/liger_kernel/transformers/model/gemma3.py
|
@@ -18,6 +18,7 @@ from transformers.models.phi3 import Phi3ForCausalLM
|
|
18
18
|
from transformers.models.qwen2 import Qwen2Config
|
19
19
|
from transformers.models.qwen2 import Qwen2ForCausalLM
|
20
20
|
|
21
|
+
from liger_kernel.transformers import apply_liger_kernel_to_falcon_h1
|
21
22
|
from liger_kernel.transformers import apply_liger_kernel_to_gemma
|
22
23
|
from liger_kernel.transformers import apply_liger_kernel_to_gemma2
|
23
24
|
from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text
|
@@ -45,6 +46,7 @@ from test.utils import MiniModelConfig
|
|
45
46
|
from test.utils import assert_verbose_allclose
|
46
47
|
from test.utils import get_logprobs
|
47
48
|
from test.utils import get_topk
|
49
|
+
from test.utils import revert_liger_kernel_to_falcon_h1
|
48
50
|
from test.utils import revert_liger_kernel_to_gemma
|
49
51
|
from test.utils import revert_liger_kernel_to_gemma2
|
50
52
|
from test.utils import revert_liger_kernel_to_gemma3_text
|
@@ -201,6 +203,15 @@ try:
|
|
201
203
|
except ImportError:
|
202
204
|
INTERNVL_AVAILABLE = False
|
203
205
|
|
206
|
+
try:
|
207
|
+
# FalconH1 is only available in transformers>=4.53.0
|
208
|
+
from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config
|
209
|
+
from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1ForCausalLM
|
210
|
+
|
211
|
+
FALCONH1_AVAILABLE = True
|
212
|
+
except ImportError:
|
213
|
+
FALCONH1_AVAILABLE = False
|
214
|
+
|
204
215
|
from liger_kernel.utils import infer_device
|
205
216
|
|
206
217
|
device = infer_device()
|
@@ -1065,6 +1076,36 @@ if INTERNVL_AVAILABLE:
|
|
1065
1076
|
),
|
1066
1077
|
)
|
1067
1078
|
|
1079
|
+
if FALCONH1_AVAILABLE:
|
1080
|
+
MINI_MODEL_SETUPS["mini_falcon_h1"] = MiniModelConfig(
|
1081
|
+
liger_kernel_patch_func=apply_liger_kernel_to_falcon_h1,
|
1082
|
+
liger_kernel_patch_revert_func=revert_liger_kernel_to_falcon_h1,
|
1083
|
+
model_class=FalconH1ForCausalLM,
|
1084
|
+
mini_model_config=FalconH1Config(
|
1085
|
+
model_type="falcon_h1",
|
1086
|
+
vocab_size=32000,
|
1087
|
+
hidden_size=256, # 4096
|
1088
|
+
num_hidden_layers=4, # 24
|
1089
|
+
num_attention_heads=4, # 32
|
1090
|
+
num_key_value_heads=2, # 8
|
1091
|
+
intermediate_size=1024, # 11008
|
1092
|
+
hidden_act="silu",
|
1093
|
+
max_position_embeddings=4096,
|
1094
|
+
initializer_range=0.02,
|
1095
|
+
rms_norm_eps=1e-6,
|
1096
|
+
use_cache=True,
|
1097
|
+
pad_token_id=0,
|
1098
|
+
bos_token_id=1,
|
1099
|
+
eos_token_id=2,
|
1100
|
+
tie_word_embeddings=False,
|
1101
|
+
mamba_d_ssm=128, # 1024
|
1102
|
+
mamba_n_heads=16, # 128
|
1103
|
+
mamba_d_state=32, # 245
|
1104
|
+
mamba_d_conv=2, # 4
|
1105
|
+
attn_implementation="eager",
|
1106
|
+
),
|
1107
|
+
)
|
1108
|
+
|
1068
1109
|
|
1069
1110
|
def create_model(model_name="mini_llama4"):
|
1070
1111
|
"""
|
@@ -1574,6 +1615,25 @@ def run_mini_model(
|
|
1574
1615
|
),
|
1575
1616
|
],
|
1576
1617
|
),
|
1618
|
+
pytest.param(
|
1619
|
+
"mini_falcon_h1",
|
1620
|
+
32,
|
1621
|
+
1e-5,
|
1622
|
+
torch.bfloat16,
|
1623
|
+
1e-2,
|
1624
|
+
1e-2,
|
1625
|
+
1e-1,
|
1626
|
+
1e-2,
|
1627
|
+
1e-2,
|
1628
|
+
1e-2,
|
1629
|
+
marks=[
|
1630
|
+
pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
|
1631
|
+
pytest.mark.skipif(
|
1632
|
+
not FALCONH1_AVAILABLE,
|
1633
|
+
reason="FalconH1 not available in this version of transformers",
|
1634
|
+
),
|
1635
|
+
],
|
1636
|
+
),
|
1577
1637
|
],
|
1578
1638
|
)
|
1579
1639
|
def test_mini_model(
|
@@ -18,6 +18,7 @@ from transformers.models.phi3 import Phi3ForCausalLM
|
|
18
18
|
from transformers.models.qwen2 import Qwen2Config
|
19
19
|
from transformers.models.qwen2 import Qwen2ForCausalLM
|
20
20
|
|
21
|
+
from liger_kernel.transformers import apply_liger_kernel_to_falcon_h1
|
21
22
|
from liger_kernel.transformers import apply_liger_kernel_to_gemma
|
22
23
|
from liger_kernel.transformers import apply_liger_kernel_to_gemma2
|
23
24
|
from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text
|
@@ -45,6 +46,7 @@ from test.utils import MiniModelConfig
|
|
45
46
|
from test.utils import assert_verbose_allclose
|
46
47
|
from test.utils import get_logprobs
|
47
48
|
from test.utils import get_topk
|
49
|
+
from test.utils import revert_liger_kernel_to_falcon_h1
|
48
50
|
from test.utils import revert_liger_kernel_to_gemma
|
49
51
|
from test.utils import revert_liger_kernel_to_gemma2
|
50
52
|
from test.utils import revert_liger_kernel_to_gemma3_text
|
@@ -201,6 +203,15 @@ try:
|
|
201
203
|
except ImportError:
|
202
204
|
INTERNVL_AVAILABLE = False
|
203
205
|
|
206
|
+
try:
|
207
|
+
# FalconH1 is only available in transformers>=4.53.0
|
208
|
+
from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config
|
209
|
+
from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1ForCausalLM
|
210
|
+
|
211
|
+
FALCONH1_AVAILABLE = True
|
212
|
+
except ImportError:
|
213
|
+
FALCONH1_AVAILABLE = False
|
214
|
+
|
204
215
|
from liger_kernel.utils import infer_device
|
205
216
|
|
206
217
|
device = infer_device()
|
@@ -1063,6 +1074,35 @@ if INTERNVL_AVAILABLE:
|
|
1063
1074
|
),
|
1064
1075
|
)
|
1065
1076
|
|
1077
|
+
if FALCONH1_AVAILABLE:
|
1078
|
+
MINI_MODEL_SETUPS["mini_falcon_h1"] = MiniModelConfig(
|
1079
|
+
liger_kernel_patch_func=apply_liger_kernel_to_falcon_h1,
|
1080
|
+
liger_kernel_patch_revert_func=revert_liger_kernel_to_falcon_h1,
|
1081
|
+
model_class=FalconH1ForCausalLM,
|
1082
|
+
mini_model_config=FalconH1Config(
|
1083
|
+
model_type="falcon_h1",
|
1084
|
+
vocab_size=32000,
|
1085
|
+
hidden_size=256, # 4096
|
1086
|
+
num_hidden_layers=4, # 24
|
1087
|
+
num_attention_heads=4, # 32
|
1088
|
+
num_key_value_heads=2, # 8
|
1089
|
+
intermediate_size=1024, # 11008
|
1090
|
+
hidden_act="silu",
|
1091
|
+
max_position_embeddings=4096,
|
1092
|
+
initializer_range=0.02,
|
1093
|
+
rms_norm_eps=1e-6,
|
1094
|
+
use_cache=True,
|
1095
|
+
pad_token_id=0,
|
1096
|
+
bos_token_id=1,
|
1097
|
+
eos_token_id=2,
|
1098
|
+
tie_word_embeddings=False,
|
1099
|
+
mamba_d_ssm=128, # 1024
|
1100
|
+
mamba_n_heads=16, # 128
|
1101
|
+
mamba_d_state=32, # 245
|
1102
|
+
mamba_d_conv=2, # 4
|
1103
|
+
),
|
1104
|
+
)
|
1105
|
+
|
1066
1106
|
|
1067
1107
|
def create_model(model_name="mini_llama3"):
|
1068
1108
|
"""
|
@@ -1547,6 +1587,25 @@ def run_mini_model(
|
|
1547
1587
|
),
|
1548
1588
|
],
|
1549
1589
|
),
|
1590
|
+
pytest.param(
|
1591
|
+
"mini_falcon_h1",
|
1592
|
+
32,
|
1593
|
+
1e-5,
|
1594
|
+
torch.bfloat16,
|
1595
|
+
1e-2,
|
1596
|
+
1e-2,
|
1597
|
+
1e-1,
|
1598
|
+
1e-2,
|
1599
|
+
1e-2,
|
1600
|
+
1e-2,
|
1601
|
+
marks=[
|
1602
|
+
pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
|
1603
|
+
pytest.mark.skipif(
|
1604
|
+
not FALCONH1_AVAILABLE,
|
1605
|
+
reason="FalconH1 not available in this version of transformers",
|
1606
|
+
),
|
1607
|
+
],
|
1608
|
+
),
|
1550
1609
|
],
|
1551
1610
|
)
|
1552
1611
|
def test_mini_model(
|