liger-kernel-nightly 0.5.10.dev20250522174514__tar.gz → 0.5.10.dev20250523162037__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.gitignore +3 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/Makefile +8 -2
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/mkdocs.yml +2 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/gemma.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/gemma2.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/gemma3.py +14 -2
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/glm4.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/llama.py +10 -2
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/llava.py +5 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/mistral.py +8 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/mixtral.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/mllama.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/olmo2.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/paligemma.py +8 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/phi3.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen2.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen2_5_vl.py +8 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen2_vl.py +8 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen3.py +11 -3
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/qwen3_moe.py +5 -2
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/bf16/test_mini_models.py +31 -18
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/fp32/test_mini_models.py +29 -16
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/.idea/workspace.xml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/data/all_benchmark_data.csv +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/index.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/docs/license.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/setup.cfg +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/setup.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/gema3_rms.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/triton/test_triton_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.10.dev20250522174514 → liger_kernel_nightly-0.5.10.dev20250523162037}/test/utils.py +0 -0
@@ -48,13 +48,19 @@ run-benchmarks:
|
|
48
48
|
# MkDocs Configuration
|
49
49
|
MKDOCS = mkdocs
|
50
50
|
CONFIG_FILE = mkdocs.yml
|
51
|
+
SITE_DIR = doc_site
|
51
52
|
|
52
53
|
# MkDocs targets
|
54
|
+
|
55
|
+
# Serve the documentation
|
53
56
|
serve:
|
54
57
|
$(MKDOCS) serve -f $(CONFIG_FILE)
|
55
58
|
|
59
|
+
# Build the documentation into the specified site directory
|
56
60
|
build:
|
57
|
-
$(MKDOCS) build -f $(CONFIG_FILE)
|
61
|
+
$(MKDOCS) build -f $(CONFIG_FILE) --site-dir $(SITE_DIR)
|
58
62
|
|
63
|
+
# Clean the output directory
|
59
64
|
clean:
|
60
|
-
rm -rf
|
65
|
+
rm -rf $(SITE_DIR)/
|
66
|
+
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "liger_kernel_nightly"
|
7
|
-
version = "0.5.10.
|
7
|
+
version = "0.5.10.dev20250523162037"
|
8
8
|
description = "Efficient Triton kernels for LLM Training"
|
9
9
|
urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
@@ -137,6 +137,7 @@ def lce_forward(
|
|
137
137
|
return_dict: Optional[bool] = None,
|
138
138
|
cache_position: Optional[torch.LongTensor] = None,
|
139
139
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
140
|
+
skip_logits: Optional[bool] = None,
|
140
141
|
**loss_kwargs,
|
141
142
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
142
143
|
r"""
|
@@ -199,8 +200,15 @@ def lce_forward(
|
|
199
200
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
200
201
|
logits = None
|
201
202
|
loss = None
|
202
|
-
|
203
|
-
if
|
203
|
+
|
204
|
+
if skip_logits and labels is None and shift_labels is None:
|
205
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
206
|
+
|
207
|
+
if skip_logits is None:
|
208
|
+
# By default, if in training mode, don't materialize logits
|
209
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
210
|
+
|
211
|
+
if skip_logits:
|
204
212
|
loss = LigerForCausalLMLoss(
|
205
213
|
hidden_states=kept_hidden_states,
|
206
214
|
lm_head_weight=self.lm_head.weight,
|
@@ -209,7 +217,7 @@ def lce_forward(
|
|
209
217
|
hidden_size=self.config.hidden_size,
|
210
218
|
**loss_kwargs,
|
211
219
|
)
|
212
|
-
else:
|
220
|
+
else:
|
213
221
|
logits = self.lm_head(kept_hidden_states)
|
214
222
|
if labels is not None:
|
215
223
|
loss = self.loss_function(
|
@@ -146,6 +146,7 @@ def lce_forward(
|
|
146
146
|
return_dict: Optional[bool] = None,
|
147
147
|
cache_position: Optional[torch.LongTensor] = None,
|
148
148
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
149
|
+
skip_logits: Optional[bool] = None,
|
149
150
|
**loss_kwargs,
|
150
151
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
151
152
|
r"""
|
@@ -213,8 +214,15 @@ def lce_forward(
|
|
213
214
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
214
215
|
logits = None
|
215
216
|
loss = None
|
216
|
-
|
217
|
-
if
|
217
|
+
|
218
|
+
if skip_logits and labels is None and shift_labels is None:
|
219
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
220
|
+
|
221
|
+
if skip_logits is None:
|
222
|
+
# By default, if in training mode, don't materialize logits
|
223
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
224
|
+
|
225
|
+
if skip_logits:
|
218
226
|
loss = LigerForCausalLMLoss(
|
219
227
|
hidden_states=kept_hidden_states,
|
220
228
|
lm_head_weight=self.lm_head.weight,
|
@@ -225,7 +233,7 @@ def lce_forward(
|
|
225
233
|
**loss_kwargs,
|
226
234
|
)
|
227
235
|
|
228
|
-
else:
|
236
|
+
else:
|
229
237
|
logits = self.lm_head(kept_hidden_states)
|
230
238
|
if self.config.final_logit_softcapping is not None:
|
231
239
|
logits = logits / self.config.final_logit_softcapping
|
@@ -35,6 +35,7 @@ def causal_forward(
|
|
35
35
|
return_dict: Optional[bool] = None,
|
36
36
|
cache_position: Optional[torch.LongTensor] = None,
|
37
37
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
38
|
+
skip_logits: Optional[bool] = None,
|
38
39
|
**loss_kwargs,
|
39
40
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
40
41
|
r"""
|
@@ -101,7 +102,11 @@ def causal_forward(
|
|
101
102
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
102
103
|
loss = None
|
103
104
|
logits = None
|
104
|
-
|
105
|
+
|
106
|
+
if skip_logits is None:
|
107
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
108
|
+
|
109
|
+
if skip_logits:
|
105
110
|
loss = LigerForCausalLMLoss(
|
106
111
|
hidden_states=kept_hidden_states,
|
107
112
|
lm_head_weight=self.lm_head.weight,
|
@@ -151,6 +156,7 @@ def multimodal_forward(
|
|
151
156
|
output_hidden_states: Optional[bool] = None,
|
152
157
|
return_dict: Optional[bool] = None,
|
153
158
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
159
|
+
skip_logits: Optional[bool] = None,
|
154
160
|
**lm_kwargs,
|
155
161
|
) -> Union[Tuple, Gemma3CausalLMOutputWithPast]:
|
156
162
|
r"""
|
@@ -272,7 +278,13 @@ def multimodal_forward(
|
|
272
278
|
loss = None
|
273
279
|
logits = None
|
274
280
|
|
275
|
-
if
|
281
|
+
if skip_logits and labels is None:
|
282
|
+
raise ValueError("skip_logits is True, but labels is None")
|
283
|
+
|
284
|
+
if skip_logits is None:
|
285
|
+
skip_logits = self.training and (labels is not None)
|
286
|
+
|
287
|
+
if skip_logits:
|
276
288
|
shift_hidden_states = hidden_states[..., :-1, :]
|
277
289
|
shift_labels = labels[..., 1:]
|
278
290
|
|
@@ -26,6 +26,7 @@ def lce_forward(
|
|
26
26
|
return_dict: Optional[bool] = None,
|
27
27
|
cache_position: Optional[torch.LongTensor] = None,
|
28
28
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
29
|
+
skip_logits: Optional[bool] = None,
|
29
30
|
**loss_kwargs,
|
30
31
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
31
32
|
r"""
|
@@ -89,8 +90,15 @@ def lce_forward(
|
|
89
90
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
90
91
|
logits = None
|
91
92
|
loss = None
|
92
|
-
|
93
|
-
if
|
93
|
+
|
94
|
+
if skip_logits and labels is None and shift_labels is None:
|
95
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
96
|
+
|
97
|
+
if skip_logits is None:
|
98
|
+
# By default, if in training mode, don't materialize logits
|
99
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
100
|
+
|
101
|
+
if skip_logits:
|
94
102
|
loss = LigerForCausalLMLoss(
|
95
103
|
hidden_states=kept_hidden_states,
|
96
104
|
lm_head_weight=self.lm_head.weight,
|
@@ -100,7 +108,7 @@ def lce_forward(
|
|
100
108
|
**loss_kwargs,
|
101
109
|
)
|
102
110
|
|
103
|
-
else:
|
111
|
+
else:
|
104
112
|
logits = self.lm_head(kept_hidden_states)
|
105
113
|
if labels is not None:
|
106
114
|
loss = self.loss_function(
|
@@ -151,6 +151,7 @@ def lce_forward(
|
|
151
151
|
return_dict: Optional[bool] = None,
|
152
152
|
cache_position: Optional[torch.LongTensor] = None,
|
153
153
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
154
|
+
skip_logits: Optional[bool] = None,
|
154
155
|
**loss_kwargs,
|
155
156
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
156
157
|
r"""
|
@@ -218,7 +219,14 @@ def lce_forward(
|
|
218
219
|
logits = None
|
219
220
|
loss = None
|
220
221
|
# if in training mode, don't materialize logits
|
221
|
-
if
|
222
|
+
if skip_logits and labels is None and shift_labels is None:
|
223
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
224
|
+
|
225
|
+
if skip_logits is None:
|
226
|
+
# By default, if in training mode, don't materialize logits
|
227
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
228
|
+
|
229
|
+
if skip_logits:
|
222
230
|
loss = lce_maybe_trainable_lm_head(
|
223
231
|
self,
|
224
232
|
hidden_states=kept_hidden_states,
|
@@ -228,7 +236,7 @@ def lce_forward(
|
|
228
236
|
**loss_kwargs,
|
229
237
|
)
|
230
238
|
|
231
|
-
else:
|
239
|
+
else:
|
232
240
|
logits = self.lm_head(kept_hidden_states)
|
233
241
|
if labels is not None:
|
234
242
|
loss = self.loss_function(
|
@@ -223,6 +223,7 @@ def lce_forward(
|
|
223
223
|
cache_position: Optional[torch.LongTensor] = None,
|
224
224
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
225
225
|
image_sizes: torch.Tensor = None,
|
226
|
+
skip_logits: Optional[bool] = None,
|
226
227
|
**lm_kwargs,
|
227
228
|
) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
|
228
229
|
r"""
|
@@ -325,7 +326,10 @@ def lce_forward(
|
|
325
326
|
loss = None
|
326
327
|
logits = None
|
327
328
|
|
328
|
-
|
329
|
+
# Overwrite skip_logits, since llava never materializes logits
|
330
|
+
skip_logits = labels is not None
|
331
|
+
|
332
|
+
if skip_logits:
|
329
333
|
# Shift so that tokens < n predict n
|
330
334
|
if attention_mask is not None:
|
331
335
|
# we use the input attention mask to shift the logits and labels, because it is 2D.
|
@@ -27,6 +27,7 @@ def lce_forward(
|
|
27
27
|
return_dict: Optional[bool] = None,
|
28
28
|
cache_position: Optional[torch.LongTensor] = None,
|
29
29
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
30
|
+
skip_logits: Optional[bool] = None,
|
30
31
|
**loss_kwargs,
|
31
32
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
32
33
|
r"""
|
@@ -93,7 +94,13 @@ def lce_forward(
|
|
93
94
|
loss = None
|
94
95
|
logits = None
|
95
96
|
|
96
|
-
if
|
97
|
+
if skip_logits and labels is None and shift_labels is None:
|
98
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
99
|
+
|
100
|
+
if skip_logits is None:
|
101
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
102
|
+
|
103
|
+
if skip_logits:
|
97
104
|
loss = LigerForCausalLMLoss(
|
98
105
|
hidden_states=kept_hidden_states,
|
99
106
|
lm_head_weight=self.lm_head.weight,
|
@@ -156,6 +156,7 @@ def lce_forward(
|
|
156
156
|
return_dict: Optional[bool] = None,
|
157
157
|
cache_position: Optional[torch.LongTensor] = None,
|
158
158
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
159
|
+
skip_logits: Optional[bool] = None,
|
159
160
|
**loss_kwargs,
|
160
161
|
) -> Union[Tuple, MoeCausalLMOutputWithPast]:
|
161
162
|
r"""
|
@@ -224,8 +225,15 @@ def lce_forward(
|
|
224
225
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
225
226
|
logits = None
|
226
227
|
loss = None
|
227
|
-
|
228
|
-
if
|
228
|
+
|
229
|
+
if skip_logits and labels is None and shift_labels is None:
|
230
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
231
|
+
|
232
|
+
if skip_logits is None:
|
233
|
+
# By default, if in training mode, don't materialize logits
|
234
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
235
|
+
|
236
|
+
if skip_logits:
|
229
237
|
loss = LigerForCausalLMLoss(
|
230
238
|
hidden_states=kept_hidden_states,
|
231
239
|
lm_head_weight=self.lm_head.weight,
|
@@ -235,7 +243,7 @@ def lce_forward(
|
|
235
243
|
**loss_kwargs,
|
236
244
|
)
|
237
245
|
|
238
|
-
else:
|
246
|
+
else:
|
239
247
|
logits = self.lm_head(kept_hidden_states)
|
240
248
|
|
241
249
|
loss = None
|
@@ -147,6 +147,7 @@ def lce_forward(
|
|
147
147
|
return_dict: Optional[bool] = None,
|
148
148
|
cache_position: Optional[torch.LongTensor] = None,
|
149
149
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
150
|
+
skip_logits: Optional[bool] = None,
|
150
151
|
**loss_kwargs,
|
151
152
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
152
153
|
r"""
|
@@ -215,8 +216,15 @@ def lce_forward(
|
|
215
216
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
216
217
|
logits = None
|
217
218
|
loss = None
|
218
|
-
|
219
|
-
if
|
219
|
+
|
220
|
+
if skip_logits and labels is None and shift_labels is None:
|
221
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
222
|
+
|
223
|
+
if skip_logits is None:
|
224
|
+
# By default, if in training mode, don't materialize logits
|
225
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
226
|
+
|
227
|
+
if skip_logits:
|
220
228
|
loss = LigerForCausalLMLoss(
|
221
229
|
hidden_states=kept_hidden_states,
|
222
230
|
lm_head_weight=self.lm_head.weight,
|
@@ -226,7 +234,7 @@ def lce_forward(
|
|
226
234
|
**loss_kwargs,
|
227
235
|
)
|
228
236
|
|
229
|
-
else:
|
237
|
+
else:
|
230
238
|
logits = self.lm_head(kept_hidden_states)
|
231
239
|
if labels is not None:
|
232
240
|
loss = self.loss_function(
|
@@ -26,6 +26,7 @@ def lce_forward(
|
|
26
26
|
return_dict: Optional[bool] = None,
|
27
27
|
cache_position: Optional[torch.LongTensor] = None,
|
28
28
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
29
|
+
skip_logits: Optional[bool] = None,
|
29
30
|
**loss_kwargs,
|
30
31
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
31
32
|
r"""
|
@@ -89,8 +90,15 @@ def lce_forward(
|
|
89
90
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
90
91
|
logits = None
|
91
92
|
loss = None
|
92
|
-
|
93
|
-
if
|
93
|
+
|
94
|
+
if skip_logits and labels is None and shift_labels is None:
|
95
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
96
|
+
|
97
|
+
if skip_logits is None:
|
98
|
+
# By default, if in training mode, don't materialize logits
|
99
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
100
|
+
|
101
|
+
if skip_logits:
|
94
102
|
loss = LigerForCausalLMLoss(
|
95
103
|
hidden_states=kept_hidden_states,
|
96
104
|
lm_head_weight=self.lm_head.weight,
|
@@ -100,7 +108,7 @@ def lce_forward(
|
|
100
108
|
**loss_kwargs,
|
101
109
|
)
|
102
110
|
|
103
|
-
else:
|
111
|
+
else:
|
104
112
|
logits = self.lm_head(kept_hidden_states)
|
105
113
|
if labels is not None:
|
106
114
|
loss = self.loss_function(
|
@@ -216,6 +216,7 @@ def lce_forward(
|
|
216
216
|
output_hidden_states: Optional[bool] = None,
|
217
217
|
return_dict: Optional[bool] = None,
|
218
218
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
219
|
+
skip_logits: Optional[bool] = None,
|
219
220
|
**lm_kwargs,
|
220
221
|
) -> Union[Tuple, PaliGemmaCausalLMOutputWithPast]:
|
221
222
|
r"""
|
@@ -331,7 +332,13 @@ def lce_forward(
|
|
331
332
|
loss = None
|
332
333
|
logits = None
|
333
334
|
|
334
|
-
if
|
335
|
+
if skip_logits and labels is None:
|
336
|
+
raise ValueError("skip_logits is True, but labels is None")
|
337
|
+
|
338
|
+
if skip_logits is None:
|
339
|
+
skip_logits = self.training and (labels is not None)
|
340
|
+
|
341
|
+
if skip_logits:
|
335
342
|
shift_hidden_states = hidden_states[..., :-1, :]
|
336
343
|
shift_labels = labels[..., 1:]
|
337
344
|
|
@@ -136,6 +136,7 @@ def lce_forward(
|
|
136
136
|
return_dict: Optional[bool] = None,
|
137
137
|
cache_position: Optional[torch.LongTensor] = None,
|
138
138
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
139
|
+
skip_logits: Optional[bool] = None,
|
139
140
|
**loss_kwargs,
|
140
141
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
141
142
|
r"""
|
@@ -212,8 +213,15 @@ def lce_forward(
|
|
212
213
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
213
214
|
logits = None
|
214
215
|
loss = None
|
215
|
-
|
216
|
-
if
|
216
|
+
|
217
|
+
if skip_logits and labels is None and shift_labels is None:
|
218
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
219
|
+
|
220
|
+
if skip_logits is None:
|
221
|
+
# By default, if in training mode, don't materialize logits
|
222
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
223
|
+
|
224
|
+
if skip_logits:
|
217
225
|
loss = LigerForCausalLMLoss(
|
218
226
|
hidden_states=kept_hidden_states,
|
219
227
|
lm_head_weight=self.lm_head.weight,
|
@@ -223,7 +231,7 @@ def lce_forward(
|
|
223
231
|
**loss_kwargs,
|
224
232
|
)
|
225
233
|
|
226
|
-
else:
|
234
|
+
else:
|
227
235
|
logits = self.lm_head(kept_hidden_states)
|
228
236
|
if labels is not None:
|
229
237
|
loss = self.loss_function(
|
@@ -135,6 +135,7 @@ def lce_forward(
|
|
135
135
|
return_dict: Optional[bool] = None,
|
136
136
|
cache_position: Optional[torch.LongTensor] = None,
|
137
137
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
138
|
+
skip_logits: Optional[bool] = None,
|
138
139
|
**loss_kwargs,
|
139
140
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
140
141
|
r"""
|
@@ -198,8 +199,15 @@ def lce_forward(
|
|
198
199
|
shift_labels = loss_kwargs.pop("shift_labels", None)
|
199
200
|
logits = None
|
200
201
|
loss = None
|
201
|
-
|
202
|
-
if
|
202
|
+
|
203
|
+
if skip_logits and labels is None and shift_labels is None:
|
204
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
205
|
+
|
206
|
+
if skip_logits is None:
|
207
|
+
# By default, if in training mode, don't materialize logits
|
208
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
209
|
+
|
210
|
+
if skip_logits:
|
203
211
|
loss = LigerForCausalLMLoss(
|
204
212
|
hidden_states=kept_hidden_states,
|
205
213
|
lm_head_weight=self.lm_head.weight,
|
@@ -209,7 +217,7 @@ def lce_forward(
|
|
209
217
|
**loss_kwargs,
|
210
218
|
)
|
211
219
|
|
212
|
-
else:
|
220
|
+
else:
|
213
221
|
logits = self.lm_head(kept_hidden_states)
|
214
222
|
if labels is not None:
|
215
223
|
loss = self.loss_function(
|
@@ -30,6 +30,7 @@ def lce_forward(
|
|
30
30
|
rope_deltas: Optional[torch.LongTensor] = None,
|
31
31
|
cache_position: Optional[torch.LongTensor] = None,
|
32
32
|
second_per_grid_ts: Optional[torch.Tensor] = None,
|
33
|
+
skip_logits: Optional[bool] = None,
|
33
34
|
**loss_kwargs,
|
34
35
|
) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
|
35
36
|
r"""
|
@@ -161,7 +162,13 @@ def lce_forward(
|
|
161
162
|
loss = None
|
162
163
|
logits = None
|
163
164
|
|
164
|
-
if
|
165
|
+
if skip_logits and labels is None and shift_labels is None:
|
166
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
167
|
+
|
168
|
+
if skip_logits is None:
|
169
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
170
|
+
|
171
|
+
if skip_logits:
|
165
172
|
loss = LigerForCausalLMLoss(
|
166
173
|
hidden_states=hidden_states,
|
167
174
|
lm_head_weight=self.lm_head.weight,
|
@@ -31,6 +31,7 @@ def lce_forward(
|
|
31
31
|
video_grid_thw: Optional[torch.LongTensor] = None,
|
32
32
|
rope_deltas: Optional[torch.LongTensor] = None,
|
33
33
|
cache_position: Optional[torch.LongTensor] = None,
|
34
|
+
skip_logits: Optional[bool] = None,
|
34
35
|
**loss_kwargs,
|
35
36
|
) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
|
36
37
|
r"""
|
@@ -165,7 +166,13 @@ def lce_forward(
|
|
165
166
|
loss = None
|
166
167
|
logits = None
|
167
168
|
|
168
|
-
if
|
169
|
+
if skip_logits and labels is None and shift_labels is None:
|
170
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
171
|
+
|
172
|
+
if skip_logits is None:
|
173
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
174
|
+
|
175
|
+
if skip_logits:
|
169
176
|
loss = LigerForCausalLMLoss(
|
170
177
|
hidden_states=hidden_states,
|
171
178
|
lm_head_weight=self.lm_head.weight,
|
@@ -22,6 +22,7 @@ def lce_forward(
|
|
22
22
|
output_hidden_states: Optional[bool] = None,
|
23
23
|
cache_position: Optional[torch.LongTensor] = None,
|
24
24
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
25
|
+
skip_logits: Optional[bool] = None,
|
25
26
|
**kwargs,
|
26
27
|
) -> CausalLMOutputWithPast:
|
27
28
|
r"""
|
@@ -82,8 +83,15 @@ def lce_forward(
|
|
82
83
|
shift_labels = kwargs.pop("shift_labels", None)
|
83
84
|
logits = None
|
84
85
|
loss = None
|
85
|
-
|
86
|
-
if
|
86
|
+
|
87
|
+
if skip_logits and labels is None and shift_labels is None:
|
88
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
89
|
+
|
90
|
+
if skip_logits is None:
|
91
|
+
# By default, if in training mode, don't materialize logits
|
92
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
93
|
+
|
94
|
+
if skip_logits:
|
87
95
|
loss = LigerForCausalLMLoss(
|
88
96
|
hidden_states=kept_hidden_states,
|
89
97
|
lm_head_weight=self.lm_head.weight,
|
@@ -93,7 +101,7 @@ def lce_forward(
|
|
93
101
|
**kwargs,
|
94
102
|
)
|
95
103
|
|
96
|
-
else:
|
104
|
+
else:
|
97
105
|
logits = self.lm_head(kept_hidden_states)
|
98
106
|
if labels is not None:
|
99
107
|
loss = self.loss_function(
|
@@ -25,6 +25,7 @@ def lce_forward(
|
|
25
25
|
output_router_logits: Optional[bool] = None,
|
26
26
|
cache_position: Optional[torch.LongTensor] = None,
|
27
27
|
logits_to_keep: Union[int, torch.Tensor] = 0,
|
28
|
+
skip_logits: Optional[bool] = None,
|
28
29
|
**loss_kwargs,
|
29
30
|
) -> MoeCausalLMOutputWithPast:
|
30
31
|
r"""
|
@@ -91,8 +92,10 @@ def lce_forward(
|
|
91
92
|
logits = None
|
92
93
|
loss = None
|
93
94
|
|
94
|
-
|
95
|
-
|
95
|
+
if skip_logits is None:
|
96
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
97
|
+
|
98
|
+
if skip_logits:
|
96
99
|
loss = LigerForCausalLMLoss(
|
97
100
|
hidden_states=kept_hidden_states,
|
98
101
|
lm_head_weight=self.lm_head.weight,
|