liger-kernel-nightly 0.6.2.dev20250923161735__tar.gz → 0.6.2.dev20251008084122__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/PKG-INFO +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/README.md +1 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/__init__.py +3 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/gemma.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/gemma2.py +8 -2
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/gemma3.py +27 -2
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/glm4.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/glm4v.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/glm4v_moe.py +2 -1
- liger_kernel_nightly-0.6.2.dev20251008084122/src/liger_kernel/transformers/model/internvl.py +150 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/llama.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/llama4.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/llava.py +6 -2
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/mistral.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/mixtral.py +8 -2
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/mllama.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/olmo2.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/paligemma.py +19 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/phi3.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/qwen2.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/qwen2_5_vl.py +7 -2
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/qwen2_vl.py +7 -2
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/qwen3.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/qwen3_moe.py +8 -2
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/smollm3.py +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/monkey_patch.py +80 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel_nightly.egg-info/PKG-INFO +2 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel_nightly.egg-info/SOURCES.txt +2 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/bf16/test_mini_models.py +64 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/bf16/test_mini_models_multimodal.py +101 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/bf16/test_mini_models_with_logits.py +62 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/fp32/test_mini_models.py +59 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/fp32/test_mini_models_multimodal.py +97 -1
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/fp32/test_mini_models_with_logits.py +59 -0
- liger_kernel_nightly-0.6.2.dev20251008084122/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json +307 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_monkey_patch.py +65 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/utils.py +30 -7
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/workflows/benchmark.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/.gitignore +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/Makefile +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/data/all_benchmark_data.csv +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/dev/modal/benchmarks.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/index.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/docs/license.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/setup.cfg +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/setup.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/test_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250923161735 → liger_kernel_nightly-0.6.2.dev20251008084122}/test/triton/test_triton_monkey_patch.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: liger_kernel_nightly
|
|
3
|
-
Version: 0.6.2.dev20250923161735
|
|
3
|
+
Version: 0.6.2.dev20251008084122
|
|
4
4
|
Summary: Efficient Triton kernels for LLM Training
|
|
5
5
|
License: BSD 2-CLAUSE LICENSE
|
|
6
6
|
Copyright 2024 LinkedIn Corporation
|
|
@@ -311,6 +311,7 @@ loss.backward()
|
|
|
311
311
|
| Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
|
|
312
312
|
| OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
313
313
|
| GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
314
|
+
| InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
314
315
|
|
|
315
316
|
|
|
316
317
|
## Low-level APIs
|
|
@@ -263,6 +263,7 @@ loss.backward()
|
|
|
263
263
|
| Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
|
|
264
264
|
| OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
265
265
|
| GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
266
|
+
| InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
266
267
|
|
|
267
268
|
|
|
268
269
|
## Low-level APIs
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "liger_kernel_nightly"
|
|
7
|
-
version = "0.6.2.dev20250923161735"
|
|
7
|
+
version = "0.6.2.dev20251008084122"
|
|
8
8
|
description = "Efficient Triton kernels for LLM Training"
|
|
9
9
|
urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
|
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
|
@@ -38,6 +38,7 @@ if TYPE_CHECKING:
|
|
|
38
38
|
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v # noqa: F401
|
|
39
39
|
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v_moe # noqa: F401
|
|
40
40
|
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_granite # noqa: F401
|
|
41
|
+
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_internvl # noqa: F401
|
|
41
42
|
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama # noqa: F401
|
|
42
43
|
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama4 # noqa: F401
|
|
43
44
|
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llava # noqa: F401
|
|
@@ -98,6 +99,7 @@ def __getattr__(name: str):
|
|
|
98
99
|
"apply_liger_kernel_to_glm4v",
|
|
99
100
|
"apply_liger_kernel_to_glm4v_moe",
|
|
100
101
|
"apply_liger_kernel_to_granite",
|
|
102
|
+
"apply_liger_kernel_to_internvl",
|
|
101
103
|
"apply_liger_kernel_to_llama",
|
|
102
104
|
"apply_liger_kernel_to_llava",
|
|
103
105
|
"apply_liger_kernel_to_llama4",
|
|
@@ -163,6 +165,7 @@ if _TRANSFORMERS_AVAILABLE:
|
|
|
163
165
|
"apply_liger_kernel_to_glm4v",
|
|
164
166
|
"apply_liger_kernel_to_glm4v_moe",
|
|
165
167
|
"apply_liger_kernel_to_granite",
|
|
168
|
+
"apply_liger_kernel_to_internvl",
|
|
166
169
|
"apply_liger_kernel_to_llama",
|
|
167
170
|
"apply_liger_kernel_to_llava",
|
|
168
171
|
"apply_liger_kernel_to_llama4",
|
|
@@ -228,10 +228,11 @@ def lce_forward(
|
|
|
228
228
|
)
|
|
229
229
|
else:
|
|
230
230
|
logits = self.lm_head(kept_hidden_states)
|
|
231
|
-
if labels is not None:
|
|
231
|
+
if labels is not None or shift_labels is not None:
|
|
232
232
|
loss = self.loss_function(
|
|
233
233
|
logits=logits,
|
|
234
234
|
labels=labels,
|
|
235
|
+
shift_labels=shift_labels,
|
|
235
236
|
vocab_size=self.config.vocab_size,
|
|
236
237
|
**kwargs,
|
|
237
238
|
)
|
|
@@ -252,8 +252,14 @@ def lce_forward(
|
|
|
252
252
|
logits = logits * self.config.final_logit_softcapping
|
|
253
253
|
|
|
254
254
|
loss = None
|
|
255
|
-
if labels is not None:
|
|
256
|
-
loss = self.loss_function(
|
|
255
|
+
if labels is not None or shift_labels is not None:
|
|
256
|
+
loss = self.loss_function(
|
|
257
|
+
logits=logits,
|
|
258
|
+
labels=labels,
|
|
259
|
+
shift_labels=shift_labels,
|
|
260
|
+
vocab_size=self.vocab_size,
|
|
261
|
+
**kwargs,
|
|
262
|
+
)
|
|
257
263
|
|
|
258
264
|
if not return_dict:
|
|
259
265
|
output = (logits,) + outputs[1:]
|
|
@@ -119,8 +119,14 @@ def causal_forward(
|
|
|
119
119
|
logits = logits / self.config.final_logit_softcapping
|
|
120
120
|
logits = torch.tanh(logits)
|
|
121
121
|
logits = logits * self.config.final_logit_softcapping
|
|
122
|
-
if labels is not None:
|
|
123
|
-
loss = self.loss_function(
|
|
122
|
+
if labels is not None or shift_labels is not None:
|
|
123
|
+
loss = self.loss_function(
|
|
124
|
+
logits=logits,
|
|
125
|
+
labels=labels,
|
|
126
|
+
shift_labels=shift_labels,
|
|
127
|
+
vocab_size=self.vocab_size,
|
|
128
|
+
**loss_kwargs,
|
|
129
|
+
)
|
|
124
130
|
|
|
125
131
|
if not return_dict:
|
|
126
132
|
output = (logits,) + outputs[1:]
|
|
@@ -275,6 +281,25 @@ def multimodal_forward(
|
|
|
275
281
|
# Flatten the tokens
|
|
276
282
|
loss_fct = nn.CrossEntropyLoss()
|
|
277
283
|
|
|
284
|
+
flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
|
|
285
|
+
flat_labels = shift_labels.view(-1).to(shift_logits.device)
|
|
286
|
+
loss = loss_fct(flat_logits, flat_labels)
|
|
287
|
+
elif shift_labels is not None:
|
|
288
|
+
# Upcast to float if we need to compute the loss to avoid potential precision issues
|
|
289
|
+
logits = logits.float()
|
|
290
|
+
shift_logits = logits[..., :-1, :]
|
|
291
|
+
if attention_mask is not None:
|
|
292
|
+
# we use the input attention mask to shift the logits and labels, because it is 2D.
|
|
293
|
+
# we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
|
|
294
|
+
shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
|
|
295
|
+
shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
|
|
296
|
+
shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
|
|
297
|
+
else:
|
|
298
|
+
shift_logits = shift_logits.contiguous()
|
|
299
|
+
shift_labels = shift_labels.contiguous()
|
|
300
|
+
# Flatten the tokens
|
|
301
|
+
loss_fct = nn.CrossEntropyLoss()
|
|
302
|
+
|
|
278
303
|
flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
|
|
279
304
|
flat_labels = shift_labels.view(-1).to(shift_logits.device)
|
|
280
305
|
loss = loss_fct(flat_logits, flat_labels)
|
|
@@ -111,10 +111,11 @@ def lce_forward(
|
|
|
111
111
|
|
|
112
112
|
else:
|
|
113
113
|
logits = self.lm_head(kept_hidden_states)
|
|
114
|
-
if labels is not None:
|
|
114
|
+
if labels is not None or shift_labels is not None:
|
|
115
115
|
loss = self.loss_function(
|
|
116
116
|
logits=logits,
|
|
117
117
|
labels=labels,
|
|
118
|
+
shift_labels=shift_labels,
|
|
118
119
|
vocab_size=self.config.vocab_size,
|
|
119
120
|
**kwargs,
|
|
120
121
|
)
|
|
@@ -133,10 +133,11 @@ def lce_forward(
|
|
|
133
133
|
|
|
134
134
|
else:
|
|
135
135
|
logits = self.lm_head(kept_hidden_states)
|
|
136
|
-
if labels is not None:
|
|
136
|
+
if labels is not None or shift_labels is not None:
|
|
137
137
|
loss = self.loss_function(
|
|
138
138
|
logits=logits,
|
|
139
139
|
labels=labels,
|
|
140
|
+
shift_labels=shift_labels,
|
|
140
141
|
vocab_size=self.config.vocab_size,
|
|
141
142
|
**kwargs,
|
|
142
143
|
)
|
|
@@ -134,10 +134,11 @@ def lce_forward(
|
|
|
134
134
|
|
|
135
135
|
else:
|
|
136
136
|
logits = self.lm_head(kept_hidden_states)
|
|
137
|
-
if labels is not None:
|
|
137
|
+
if labels is not None or shift_labels is not None:
|
|
138
138
|
loss = self.loss_function(
|
|
139
139
|
logits=logits,
|
|
140
140
|
labels=labels,
|
|
141
|
+
shift_labels=shift_labels,
|
|
141
142
|
vocab_size=self.config.vocab_size,
|
|
142
143
|
**kwargs,
|
|
143
144
|
)
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
from typing import Union
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
|
|
8
|
+
from transformers.models.internvl.modeling_internvl import InternVLCausalLMOutputWithPast
|
|
9
|
+
from transformers.utils import can_return_tuple
|
|
10
|
+
|
|
11
|
+
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Copied from https://github.com/huggingface/transformers/blob/d888bd435d0c0eaabaabad5b33d52af518c7187c/src/transformers/models/internvl/modeling_internvl.py#L862
|
|
15
|
+
@can_return_tuple
|
|
16
|
+
def lce_forward(
|
|
17
|
+
self,
|
|
18
|
+
input_ids: torch.LongTensor = None,
|
|
19
|
+
pixel_values: Optional[torch.FloatTensor] = None,
|
|
20
|
+
attention_mask: Optional[torch.Tensor] = None,
|
|
21
|
+
position_ids: Optional[torch.LongTensor] = None,
|
|
22
|
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
|
23
|
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
|
24
|
+
vision_feature_layer: Optional[Union[int, List[int]]] = None,
|
|
25
|
+
vision_feature_select_strategy: Optional[str] = None,
|
|
26
|
+
labels: Optional[torch.LongTensor] = None,
|
|
27
|
+
use_cache: Optional[bool] = None,
|
|
28
|
+
output_attentions: Optional[bool] = None,
|
|
29
|
+
output_hidden_states: Optional[bool] = None,
|
|
30
|
+
return_dict: Optional[bool] = None,
|
|
31
|
+
cache_position: Optional[torch.LongTensor] = None,
|
|
32
|
+
logits_to_keep: Union[int, torch.Tensor] = 0,
|
|
33
|
+
image_sizes: Optional[torch.Tensor] = None,
|
|
34
|
+
skip_logits: Optional[bool] = None, # Added argument for liger-kernel
|
|
35
|
+
**lm_kwargs, # renamed from kwargs
|
|
36
|
+
) -> Union[Tuple, InternVLCausalLMOutputWithPast]:
|
|
37
|
+
r"""
|
|
38
|
+
Example:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
>>> import torch
|
|
42
|
+
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
|
|
43
|
+
|
|
44
|
+
>>> torch_device = "cuda"
|
|
45
|
+
>>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
|
|
46
|
+
>>> model = AutoModelForImageTextToText.from_pretrained(
|
|
47
|
+
... "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
|
|
48
|
+
... )
|
|
49
|
+
|
|
50
|
+
>>> messages = [
|
|
51
|
+
... {
|
|
52
|
+
... "role": "user",
|
|
53
|
+
... "content": [
|
|
54
|
+
... {
|
|
55
|
+
... "type": "image",
|
|
56
|
+
... "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
|
|
57
|
+
... },
|
|
58
|
+
... {
|
|
59
|
+
... "type": "image",
|
|
60
|
+
... "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
|
|
61
|
+
... },
|
|
62
|
+
... {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
|
|
63
|
+
... ],
|
|
64
|
+
... },
|
|
65
|
+
... ]
|
|
66
|
+
|
|
67
|
+
>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
|
|
68
|
+
>>> generate_ids = model.generate(**inputs, max_new_tokens=200)
|
|
69
|
+
>>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
|
|
70
|
+
The images depict the Statue of Liberty and the Golden Gate Bridge.
|
|
71
|
+
```"""
|
|
72
|
+
|
|
73
|
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
|
74
|
+
output_hidden_states = (
|
|
75
|
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
76
|
+
)
|
|
77
|
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
|
78
|
+
vision_feature_layer = (
|
|
79
|
+
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
|
|
80
|
+
)
|
|
81
|
+
vision_feature_select_strategy = (
|
|
82
|
+
vision_feature_select_strategy
|
|
83
|
+
if vision_feature_select_strategy is not None
|
|
84
|
+
else self.config.vision_feature_select_strategy
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
outputs = self.model(
|
|
88
|
+
input_ids=input_ids,
|
|
89
|
+
pixel_values=pixel_values,
|
|
90
|
+
attention_mask=attention_mask,
|
|
91
|
+
position_ids=position_ids,
|
|
92
|
+
past_key_values=past_key_values,
|
|
93
|
+
inputs_embeds=inputs_embeds,
|
|
94
|
+
vision_feature_layer=vision_feature_layer,
|
|
95
|
+
vision_feature_select_strategy=vision_feature_select_strategy,
|
|
96
|
+
use_cache=use_cache,
|
|
97
|
+
output_attentions=output_attentions,
|
|
98
|
+
output_hidden_states=output_hidden_states,
|
|
99
|
+
return_dict=return_dict,
|
|
100
|
+
cache_position=cache_position,
|
|
101
|
+
image_sizes=image_sizes,
|
|
102
|
+
**lm_kwargs,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Copied from llava.py
|
|
106
|
+
hidden_states = outputs[0]
|
|
107
|
+
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
|
108
|
+
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
|
109
|
+
kept_hidden_states = hidden_states[:, slice_indices, :]
|
|
110
|
+
|
|
111
|
+
shift_labels = lm_kwargs.pop("shift_labels", None)
|
|
112
|
+
logits = None
|
|
113
|
+
loss = None
|
|
114
|
+
|
|
115
|
+
if skip_logits and labels is None and shift_labels is None:
|
|
116
|
+
raise ValueError("skip_logits is True, but labels and shift_labels are None")
|
|
117
|
+
|
|
118
|
+
if skip_logits is None:
|
|
119
|
+
# By default, if in training mode, don't materialize logits
|
|
120
|
+
skip_logits = self.training and (labels is not None or shift_labels is not None)
|
|
121
|
+
|
|
122
|
+
if skip_logits:
|
|
123
|
+
loss = LigerForCausalLMLoss(
|
|
124
|
+
hidden_states=kept_hidden_states,
|
|
125
|
+
lm_head_weight=self.lm_head.weight,
|
|
126
|
+
labels=labels,
|
|
127
|
+
shift_labels=shift_labels,
|
|
128
|
+
hidden_size=self.config.text_config.hidden_size,
|
|
129
|
+
**lm_kwargs,
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
else:
|
|
133
|
+
logits = self.lm_head(kept_hidden_states)
|
|
134
|
+
if labels is not None:
|
|
135
|
+
loss = self.loss_function(
|
|
136
|
+
logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
if not return_dict:
|
|
140
|
+
output = (logits,) + outputs[1:]
|
|
141
|
+
return (loss,) + output if loss is not None else output
|
|
142
|
+
|
|
143
|
+
return InternVLCausalLMOutputWithPast(
|
|
144
|
+
loss=loss,
|
|
145
|
+
logits=logits,
|
|
146
|
+
past_key_values=outputs.past_key_values,
|
|
147
|
+
hidden_states=outputs.hidden_states,
|
|
148
|
+
attentions=outputs.attentions,
|
|
149
|
+
image_hidden_states=outputs.image_hidden_states,
|
|
150
|
+
)
|
|
@@ -248,10 +248,11 @@ def lce_forward(
|
|
|
248
248
|
|
|
249
249
|
else:
|
|
250
250
|
logits = self.lm_head(kept_hidden_states)
|
|
251
|
-
if labels is not None:
|
|
251
|
+
if labels is not None or shift_labels is not None:
|
|
252
252
|
loss = self.loss_function(
|
|
253
253
|
logits=logits,
|
|
254
254
|
labels=labels,
|
|
255
|
+
shift_labels=shift_labels,
|
|
255
256
|
vocab_size=self.config.vocab_size,
|
|
256
257
|
**kwargs,
|
|
257
258
|
)
|
|
@@ -91,10 +91,11 @@ def lce_forward(
|
|
|
91
91
|
|
|
92
92
|
else: # if in inference mode materialize logits
|
|
93
93
|
logits = self.lm_head(kept_hidden_states)
|
|
94
|
-
if labels is not None:
|
|
94
|
+
if labels is not None or shift_labels is not None:
|
|
95
95
|
loss = self.loss_function(
|
|
96
96
|
logits=logits,
|
|
97
97
|
labels=labels,
|
|
98
|
+
shift_labels=shift_labels,
|
|
98
99
|
vocab_size=self.config.vocab_size,
|
|
99
100
|
**kwargs,
|
|
100
101
|
)
|
|
@@ -313,9 +313,13 @@ def lce_forward(
|
|
|
313
313
|
|
|
314
314
|
else:
|
|
315
315
|
logits = self.lm_head(kept_hidden_states)
|
|
316
|
-
if labels is not None:
|
|
316
|
+
if labels is not None or shift_labels is not None:
|
|
317
317
|
loss = self.loss_function(
|
|
318
|
-
logits=logits,
|
|
318
|
+
logits=logits,
|
|
319
|
+
labels=labels,
|
|
320
|
+
shift_labels=shift_labels,
|
|
321
|
+
vocab_size=self.config.text_config.vocab_size,
|
|
322
|
+
**lm_kwargs,
|
|
319
323
|
)
|
|
320
324
|
|
|
321
325
|
if not return_dict:
|
|
@@ -115,10 +115,11 @@ def lce_forward(
|
|
|
115
115
|
logits = self.lm_head(kept_hidden_states)
|
|
116
116
|
|
|
117
117
|
loss = None
|
|
118
|
-
if labels is not None:
|
|
118
|
+
if labels is not None or shift_labels is not None:
|
|
119
119
|
loss = self.loss_function(
|
|
120
120
|
logits=logits,
|
|
121
121
|
labels=labels,
|
|
122
|
+
shift_labels=shift_labels,
|
|
122
123
|
vocab_size=self.config.vocab_size,
|
|
123
124
|
**kwargs,
|
|
124
125
|
)
|
|
@@ -248,8 +248,14 @@ def lce_forward(
|
|
|
248
248
|
logits = self.lm_head(kept_hidden_states)
|
|
249
249
|
|
|
250
250
|
loss = None
|
|
251
|
-
if labels is not None:
|
|
252
|
-
loss = self.loss_function(
|
|
251
|
+
if labels is not None or shift_labels is not None:
|
|
252
|
+
loss = self.loss_function(
|
|
253
|
+
logits=logits,
|
|
254
|
+
labels=labels,
|
|
255
|
+
shift_labels=shift_labels,
|
|
256
|
+
vocab_size=self.vocab_size,
|
|
257
|
+
**kwargs,
|
|
258
|
+
)
|
|
253
259
|
aux_loss = None
|
|
254
260
|
if output_router_logits:
|
|
255
261
|
aux_loss = load_balancing_loss_func(
|
|
@@ -239,10 +239,11 @@ def lce_forward(
|
|
|
239
239
|
|
|
240
240
|
else:
|
|
241
241
|
logits = self.lm_head(kept_hidden_states)
|
|
242
|
-
if labels is not None:
|
|
242
|
+
if labels is not None or shift_labels is not None:
|
|
243
243
|
loss = self.loss_function(
|
|
244
244
|
logits=logits,
|
|
245
245
|
labels=labels,
|
|
246
|
+
shift_labels=shift_labels,
|
|
246
247
|
vocab_size=self.config.vocab_size,
|
|
247
248
|
**kwargs,
|
|
248
249
|
)
|
|
@@ -111,10 +111,11 @@ def lce_forward(
|
|
|
111
111
|
|
|
112
112
|
else:
|
|
113
113
|
logits = self.lm_head(kept_hidden_states)
|
|
114
|
-
if labels is not None:
|
|
114
|
+
if labels is not None or shift_labels is not None:
|
|
115
115
|
loss = self.loss_function(
|
|
116
116
|
logits=logits,
|
|
117
117
|
labels=labels,
|
|
118
|
+
shift_labels=shift_labels,
|
|
118
119
|
vocab_size=self.config.vocab_size,
|
|
119
120
|
**kwargs,
|
|
120
121
|
)
|
|
@@ -379,6 +379,25 @@ def lce_forward(
|
|
|
379
379
|
# Flatten the tokens
|
|
380
380
|
loss_fct = CrossEntropyLoss()
|
|
381
381
|
|
|
382
|
+
flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
|
|
383
|
+
flat_labels = shift_labels.view(-1).to(shift_logits.device)
|
|
384
|
+
loss = loss_fct(flat_logits, flat_labels)
|
|
385
|
+
elif shift_labels is not None:
|
|
386
|
+
# Upcast to float if we need to compute the loss to avoid potential precision issues
|
|
387
|
+
logits = logits.float()
|
|
388
|
+
shift_logits = logits[..., :-1, :]
|
|
389
|
+
if attention_mask is not None:
|
|
390
|
+
# we use the input attention mask to shift the logits and labels, because it is 2D.
|
|
391
|
+
# we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
|
|
392
|
+
shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
|
|
393
|
+
shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
|
|
394
|
+
shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
|
|
395
|
+
else:
|
|
396
|
+
shift_logits = shift_logits.contiguous()
|
|
397
|
+
shift_labels = shift_labels.contiguous()
|
|
398
|
+
# Flatten the tokens
|
|
399
|
+
loss_fct = CrossEntropyLoss()
|
|
400
|
+
|
|
382
401
|
flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
|
|
383
402
|
flat_labels = shift_labels.view(-1).to(shift_logits.device)
|
|
384
403
|
loss = loss_fct(flat_logits, flat_labels)
|
|
@@ -91,10 +91,11 @@ def lce_forward(
|
|
|
91
91
|
|
|
92
92
|
else:
|
|
93
93
|
logits = self.lm_head(kept_hidden_states)
|
|
94
|
-
if labels is not None:
|
|
94
|
+
if labels is not None or shift_labels is not None:
|
|
95
95
|
loss = self.loss_function(
|
|
96
96
|
logits=logits,
|
|
97
97
|
labels=labels,
|
|
98
|
+
shift_labels=shift_labels,
|
|
98
99
|
vocab_size=self.config.vocab_size,
|
|
99
100
|
**kwargs,
|
|
100
101
|
)
|
|
@@ -228,10 +228,11 @@ def lce_forward(
|
|
|
228
228
|
|
|
229
229
|
else:
|
|
230
230
|
logits = self.lm_head(kept_hidden_states)
|
|
231
|
-
if labels is not None:
|
|
231
|
+
if labels is not None or shift_labels is not None:
|
|
232
232
|
loss = self.loss_function(
|
|
233
233
|
logits=logits,
|
|
234
234
|
labels=labels,
|
|
235
|
+
shift_labels=shift_labels,
|
|
235
236
|
vocab_size=self.config.vocab_size,
|
|
236
237
|
**kwargs,
|
|
237
238
|
)
|
|
@@ -133,8 +133,13 @@ def lce_forward(
|
|
|
133
133
|
logits = self.lm_head(hidden_states)
|
|
134
134
|
|
|
135
135
|
loss = None
|
|
136
|
-
if labels is not None:
|
|
137
|
-
loss = self.loss_function(
|
|
136
|
+
if labels is not None or shift_labels is not None:
|
|
137
|
+
loss = self.loss_function(
|
|
138
|
+
logits=logits,
|
|
139
|
+
labels=labels,
|
|
140
|
+
shift_labels=shift_labels,
|
|
141
|
+
vocab_size=self.config.vocab_size,
|
|
142
|
+
)
|
|
138
143
|
|
|
139
144
|
if not return_dict:
|
|
140
145
|
output = (logits,) + outputs[1:]
|
|
@@ -129,8 +129,13 @@ def lce_forward(
|
|
|
129
129
|
logits = self.lm_head(hidden_states)
|
|
130
130
|
|
|
131
131
|
loss = None
|
|
132
|
-
if labels is not None:
|
|
133
|
-
loss = self.loss_function(
|
|
132
|
+
if labels is not None or shift_labels is not None:
|
|
133
|
+
loss = self.loss_function(
|
|
134
|
+
logits=logits,
|
|
135
|
+
labels=labels,
|
|
136
|
+
shift_labels=shift_labels,
|
|
137
|
+
vocab_size=self.config.vocab_size,
|
|
138
|
+
)
|
|
134
139
|
|
|
135
140
|
return Qwen2VLCausalLMOutputWithPast(
|
|
136
141
|
loss=loss,
|
|
@@ -103,10 +103,11 @@ def lce_forward(
|
|
|
103
103
|
|
|
104
104
|
else:
|
|
105
105
|
logits = self.lm_head(kept_hidden_states)
|
|
106
|
-
if labels is not None:
|
|
106
|
+
if labels is not None or shift_labels is not None:
|
|
107
107
|
loss = self.loss_function(
|
|
108
108
|
logits=logits,
|
|
109
109
|
labels=labels,
|
|
110
|
+
shift_labels=shift_labels,
|
|
110
111
|
vocab_size=self.config.vocab_size,
|
|
111
112
|
**kwargs,
|
|
112
113
|
)
|
|
@@ -107,8 +107,14 @@ def lce_forward(
|
|
|
107
107
|
)
|
|
108
108
|
else: # if in inference model materialize logits
|
|
109
109
|
logits = self.lm_head(kept_hidden_states)
|
|
110
|
-
if labels is not None:
|
|
111
|
-
loss = self.loss_function(
|
|
110
|
+
if labels is not None or shift_labels is not None:
|
|
111
|
+
loss = self.loss_function(
|
|
112
|
+
logits=logits,
|
|
113
|
+
labels=labels,
|
|
114
|
+
shift_labels=shift_labels,
|
|
115
|
+
vocab_size=self.vocab_size,
|
|
116
|
+
**kwargs,
|
|
117
|
+
)
|
|
112
118
|
|
|
113
119
|
aux_loss = None
|
|
114
120
|
if output_router_logits:
|
|
@@ -121,10 +121,11 @@ def lce_forward(
|
|
|
121
121
|
|
|
122
122
|
else:
|
|
123
123
|
logits = self.lm_head(kept_hidden_states)
|
|
124
|
-
if labels is not None:
|
|
124
|
+
if labels is not None or shift_labels is not None:
|
|
125
125
|
loss = self.loss_function(
|
|
126
126
|
logits=logits,
|
|
127
127
|
labels=labels,
|
|
128
|
+
shift_labels=shift_labels,
|
|
128
129
|
vocab_size=self.config.vocab_size,
|
|
129
130
|
**kwargs,
|
|
130
131
|
)
|