liger-kernel-nightly 0.5.5.dev20250318140935__tar.gz → 0.5.5.dev20250320214749__tar.gz
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/PKG-INFO +2 -1
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/README.md +1 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/paligemma.py +184 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/monkey_patch.py +39 -13
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/PKG-INFO +2 -1
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/bf16/test_mini_models_multimodal.py +75 -3
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/fp32/test_mini_models_multimodal.py +74 -3
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/utils.py +3 -1
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/.gitignore +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/Makefile +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/data/all_benchmark_data.csv +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/index.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/docs/license.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/setup.cfg +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/setup.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/fused_linear_rlhf.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/olmo2.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/bf16/test_mini_models.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/fp32/test_mini_models.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.5.5.dev20250318140935 → liger_kernel_nightly-0.5.5.dev20250320214749}/test/triton/test_triton_monkey_patch.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.5.dev20250318140935
+Version: 0.5.5.dev20250320214749
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -113,6 +113,7 @@ Requires-Dist: mkdocs-material; extra == "dev"
 <details>
 <summary>Latest News 🔥</summary>

+- [2025/03/06] We release a joint blog post on TorchTune × Liger - [Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel](https://pytorch.org/blog/peak-performance-minimized-memory/)
 - [2024/12/11] We release [v0.5.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.5.0): 80% more memory efficient post training losses (DPO, ORPO, CPO, etc)!
 - [2024/12/5] We release LinkedIn Engineering Blog - [Liger-Kernel: Empowering an open source ecosystem of Triton Kernels for Efficient LLM Training](https://www.linkedin.com/blog/engineering/open-source/liger-kernel-open-source-ecosystem-for-efficient-llm-training)
 - [2024/11/6] We release [v0.4.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.4.0): Full AMD support, Tech Report, Modal CI, Llama-3.2-Vision!

README.md

@@ -65,6 +65,7 @@
 <details>
 <summary>Latest News 🔥</summary>

+- [2025/03/06] We release a joint blog post on TorchTune × Liger - [Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel](https://pytorch.org/blog/peak-performance-minimized-memory/)
 - [2024/12/11] We release [v0.5.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.5.0): 80% more memory efficient post training losses (DPO, ORPO, CPO, etc)!
 - [2024/12/5] We release LinkedIn Engineering Blog - [Liger-Kernel: Empowering an open source ecosystem of Triton Kernels for Efficient LLM Training](https://www.linkedin.com/blog/engineering/open-source/liger-kernel-open-source-ecosystem-for-efficient-llm-training)
 - [2024/11/6] We release [v0.4.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.4.0): Full AMD support, Tech Report, Modal CI, Llama-3.2-Vision!

pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "liger_kernel_nightly"
-version = "0.5.5.dev20250318140935"
+version = "0.5.5.dev20250320214749"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

src/liger_kernel/transformers/model/paligemma.py

@@ -21,6 +21,190 @@ from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinea
 logger = logging.get_logger(__name__)


+@add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
+@replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+def lce_forward_deprecated(
+    self,
+    input_ids: torch.LongTensor = None,
+    pixel_values: torch.FloatTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+    token_type_ids: Optional[torch.LongTensor] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+) -> Union[Tuple, PaliGemmaCausalLMOutputWithPast]:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+
+    >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/PaliGemma-test-224px-hf")
+    >>> processor = AutoProcessor.from_pretrained("google/PaliGemma-test-224px-hf")
+
+    >>> prompt = "answer en Where is the cow standing?"
+    >>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(**inputs, max_length=30)
+    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "answer en Where is the cow standing?\nbeach"
+    ```"""
+
+    if (input_ids is None) ^ (inputs_embeds is not None):
+        raise ValueError(
+            "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+        )
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # the attention mask is turned 4d after, we keep track of the original one
+    input_attention_mask = attention_mask
+
+    if inputs_embeds is None:
+        # 1. Extra the input embeddings
+        inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        # 2. Merge text and images
+        if pixel_values is not None and input_ids.shape[1] != 1:
+            image_outputs = self.vision_tower(pixel_values.to(inputs_embeds.dtype))
+            selected_image_feature = image_outputs.last_hidden_state
+            image_features = self.multi_modal_projector(selected_image_feature)
+
+            if cache_position is None:
+                cache_position = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device)
+            inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
+                image_features, inputs_embeds, input_ids, attention_mask, labels, token_type_ids, cache_position
+            )
+
+        else:
+            # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+            # generation with cache
+            if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                # Retrieve the first layer to inspect the logits and mask out the hidden states
+                # that are set to 0
+                # TODO @molbap this will only work for dynamic cache.
+                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+
+                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+
+                # Get the target length
+                target_seqlen = cache_position[-1] + 1
+                extended_attention_mask = torch.ones(
+                    (attention_mask.shape[0], target_seqlen - attention_mask.shape[1] + 1),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+                # Filter out only the tokens that can be un-attended, this can happen
+                # if one uses PaliGemma+ Fused modules where the cache on the
+                # first iteration is already big enough, or if one passes custom cache
+                valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                new_batch_index = batch_index[valid_indices]
+                new_non_attended_tokens = non_attended_tokens[valid_indices]
+
+                # Zero-out the places where we don't need to attend
+                extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
+
+                attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+
+    attention_mask = attention_mask.to(inputs_embeds.dtype)
+    outputs = self.language_model.model(
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        cache_position=cache_position,
+    )
+
+    hidden_states = outputs[0]
+
+    loss = None
+    logits = None
+
+    if self.training and (labels is not None):
+        shift_hidden_states = hidden_states[..., :-1, :]
+        shift_labels = labels[..., 1:]
+
+        hidden_device = shift_hidden_states.device
+
+        if attention_mask is not None:
+            # we use the input attention mask to shift the hidden_states and labels, because it is 2D.
+            # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+            shift_attention_mask = attention_mask[:, -shift_hidden_states.shape[1] :].to(hidden_device)
+            shift_hidden_states = shift_hidden_states[shift_attention_mask.to(hidden_device) != 0].contiguous()
+            shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+        else:
+            shift_hidden_states = shift_hidden_states.contiguous()
+            shift_labels = shift_labels.contiguous()
+
+        # Flatten hidden state
+        shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size)
+        shift_labels = shift_labels.view(-1).to(hidden_device)
+
+        lce = LigerFusedLinearCrossEntropyLoss()
+        loss = lce(self.language_model.lm_head.weight, shift_hidden_states, shift_labels)
+
+    else:
+        logits = self.language_model.lm_head(hidden_states)
+        if labels is not None:
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if input_attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                shift_attention_mask = input_attention_mask[..., 1:]
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+
+            flat_logits = shift_logits.view(-1, self.config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        return (loss,) + output if loss is not None else output
+
+    return PaliGemmaCausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+    )
+
+
 @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
 @add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
 @replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)

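What the new `lce_forward_deprecated` training branch buys is the usual Liger fused-linear-cross-entropy saving: instead of projecting hidden states through `lm_head` and handing a full `(num_tokens, vocab_size)` logits tensor to `CrossEntropyLoss`, it hands the `lm_head` weight and the flattened hidden states to one fused Triton kernel, so the logits are never materialized. Below is a minimal sketch of that equivalence, not part of the diff; the toy shapes are made up for illustration, and the fused call (which needs a CUDA device) is shown commented out.

# Sketch (illustrative only): the unfused reference path materializes the
# (num_tokens x vocab_size) logits; the fused Liger call computes the same
# loss value without allocating them.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_tokens, hidden_size, vocab_size = 8, 16, 32  # toy shapes, assumptions
hidden_states = torch.randn(num_tokens, hidden_size)
lm_head_weight = torch.randn(vocab_size, hidden_size)
labels = torch.randint(0, vocab_size, (num_tokens,))

# Reference (unfused) path, as in the non-training branch above.
logits = hidden_states @ lm_head_weight.T
reference_loss = F.cross_entropy(logits, labels)

# Fused path, as invoked in the training branch above (requires a GPU):
#   from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
#   lce = LigerFusedLinearCrossEntropyLoss()
#   loss = lce(lm_head_weight, hidden_states, labels)  # matches reference_loss
print(reference_loss)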
src/liger_kernel/transformers/monkey_patch.py

@@ -631,6 +631,7 @@ def apply_liger_kernel_to_paligemma(

     # PaliGemma submodules are ['vision_tower', 'multi_modal_projector', 'language_model']

+    from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
     from transformers.models.gemma2.modeling_gemma2 import Gemma2ForCausalLM
     from transformers.models.paligemma import modeling_paligemma
     from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
@@ -639,6 +640,7 @@ def apply_liger_kernel_to_paligemma(
     from transformers.models.siglip.modeling_siglip import SiglipVisionModel

     from liger_kernel.transformers.model.paligemma import lce_forward
+    from liger_kernel.transformers.model.paligemma import lce_forward_deprecated

     # The vision_tower is a SiglipVisionModel
     if layer_norm:
@@ -647,13 +649,22 @@ def apply_liger_kernel_to_paligemma(
     # SiglipMLP is standard FFN so LigerGEGLUMLP is not compatible
     # The multi_modal_projector is Linear, nothing to do

-    # The language_model is Gemma2ForCausalLM
-
+    # The language_model is GemmaForCausalLM or Gemma2ForCausalLM
+    apply_liger_kernel_to_gemma(
+        rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu
+    )
+    apply_liger_kernel_to_gemma2(
+        rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu
+    )
     # Handle loss function
     if cross_entropy:
         modeling_paligemma.nn.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
-
+        if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
+            modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
+        else:  # if version < 4.46.1
+            logger.warning(TRANSFORMER_DEPRECATION_WARNING)
+            modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward_deprecated

     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -672,16 +683,31 @@ def apply_liger_kernel_to_paligemma(
             _patch_layer_norm_module(layer.layer_norm1)
             _patch_layer_norm_module(layer.layer_norm2)

-        language_model
-
-
-
-
-
-
-
-
-
+        language_model = model.language_model
+
+        if isinstance(language_model, GemmaForCausalLM):
+            apply_liger_kernel_to_gemma(
+                rope=rope,
+                cross_entropy=False,
+                fused_linear_cross_entropy=False,
+                rms_norm=rms_norm,
+                geglu=geglu,
+                model=language_model,
+            )
+
+        elif isinstance(language_model, Gemma2ForCausalLM):
+            apply_liger_kernel_to_gemma2(
+                rope=rope,
+                cross_entropy=False,
+                fused_linear_cross_entropy=False,
+                rms_norm=rms_norm,
+                geglu=geglu,
+                model=language_model,
+            )
+        else:
+            raise TypeError(
+                "The language_model of a PaliGemma model must be either GemmaForCausalLM or Gemma2ForCausalLM."
+            )


 def apply_liger_kernel_to_qwen2(

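For orientation, `apply_liger_kernel_to_paligemma` can be applied either before a model is constructed (patching the `transformers` modules in place) or to an existing instance via `model=`, which is the path the new `isinstance` dispatch serves. A hedged usage sketch follows; the checkpoint name and flag values are illustrative, and the keyword names are the ones visible in the hunks above.

# Usage sketch (assumptions: checkpoint name and flag values are illustrative).
from transformers import PaliGemmaForConditionalGeneration

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_paligemma

# Patch an already-instantiated model: the new code inspects
# type(model.language_model) and applies the Gemma or Gemma2 kernels,
# raising TypeError for anything else.
model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-pt-224")
apply_liger_kernel_to_paligemma(
    rope=True,
    cross_entropy=False,
    fused_linear_cross_entropy=True,  # routed to lce_forward or lce_forward_deprecated by transformers version
    rms_norm=True,
    geglu=True,
    model=model,
)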
src/liger_kernel_nightly.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.5.dev20250318140935
+Version: 0.5.5.dev20250320214749
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -113,6 +113,7 @@ Requires-Dist: mkdocs-material; extra == "dev"
 <details>
 <summary>Latest News 🔥</summary>

+- [2025/03/06] We release a joint blog post on TorchTune × Liger - [Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel](https://pytorch.org/blog/peak-performance-minimized-memory/)
 - [2024/12/11] We release [v0.5.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.5.0): 80% more memory efficient post training losses (DPO, ORPO, CPO, etc)!
 - [2024/12/5] We release LinkedIn Engineering Blog - [Liger-Kernel: Empowering an open source ecosystem of Triton Kernels for Efficient LLM Training](https://www.linkedin.com/blog/engineering/open-source/liger-kernel-open-source-ecosystem-for-efficient-llm-training)
 - [2024/11/6] We release [v0.4.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.4.0): Full AMD support, Tech Report, Modal CI, Llama-3.2-Vision!

test/convergence/bf16/test_mini_models_multimodal.py

@@ -64,6 +64,10 @@ except ImportError:
     MLLAMA_AVAILABLE = False

 try:
+    import transformers
+
+    from packaging import version
+    from transformers.models.gemma.configuration_gemma import GemmaConfig
     from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
     from transformers.models.gemma2.configuration_gemma2 import Gemma2Config
     from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig
@@ -72,7 +76,7 @@ try:
     from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
     from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor

-    PALIGEMMA_AVAILABLE =
+    PALIGEMMA_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.46.0")
 except ImportError:
     PALIGEMMA_AVAILABLE = False

@@ -152,6 +156,55 @@ if MLLAMA_AVAILABLE:

 if PALIGEMMA_AVAILABLE:
     MINI_MODEL_SETUPS["mini_paligemma"] = MiniModelConfig(
+        liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False),
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma,
+        model_class=PaliGemmaForConditionalGeneration,
+        mini_model_config=PaliGemmaConfig(
+            vision_config=SiglipVisionConfig(
+                attention_dropout=0.0,
+                hidden_act="gelu_pytorch_tanh",
+                hidden_size=1152,
+                image_size=224,
+                intermediate_size=2048,  # 4304
+                layer_norm_eps=1e-06,
+                num_attention_heads=4,  # 16
+                num_channels=3,
+                num_hidden_layers=4,  # 27
+                num_image_tokens=256,
+                num_positions=256,
+                patch_size=14,
+                projection_dim=1024,  # 2304
+            ),
+            text_config=GemmaConfig(
+                vocab_size=32000,  # 256000
+                hidden_size=1024,  # 3072
+                intermediate_size=2048,  # 24576
+                num_hidden_layers=4,  # 28
+                num_attention_heads=4,  # 16
+                num_key_value_heads=4,  # 16
+                head_dim=256,
+                hidden_activation="gelu_pytorch_tanh",
+                max_position_embeddings=8192,
+                initializer_range=0.02,
+                rms_norm_eps=1e-06,
+                use_cache=True,
+                pad_token_id=0,
+                # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
+                # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+                bos_token_id=1,  # 128000
+                eos_token_id=2,  # 128001
+                tie_word_embeddings=True,
+                rope_theta=10000.0,
+                attention_bias=False,
+                attention_dropout=0.0,
+            ),
+            image_token_index=4,  # NOTE: outside the vocab size
+            attn_implementation="eager",
+            vocab_size=32000,
+            projection_dim=1024,
+        ),
+    )
+    MINI_MODEL_SETUPS["mini_paligemma2"] = MiniModelConfig(
         liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False),
         liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma,
         model_class=PaliGemmaForConditionalGeneration,
@@ -297,7 +350,7 @@ if QWEN2_5_VL_AVAILABLE:
     )


-def create_processor(model_name):
+def create_processor(model_name: str):
     if model_name == "mini_qwen2_vl":
         tokenizer_config = load_tokenizer_config(
             os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json")
@@ -352,7 +405,7 @@ def create_processor(model_name):
         image_processor = MllamaImageProcessor(size={"height": 560, "width": 560})
         return MllamaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)

-    elif model_name
+    elif model_name.startswith("mini_paligemma"):
         tokenizer_config = load_tokenizer_config(
             os.path.join(
                 FAKE_CONFIGS_PATH,
@@ -580,6 +633,25 @@ def run_mini_model_multimodal(
                 ),
             ],
         ),
+        pytest.param(
+            "mini_paligemma2",
+            32,
+            1e-4,
+            torch.bfloat16,
+            1e-3,
+            1e-2,
+            1e-1,
+            1e-2,
+            1e-2,
+            1e-2,
+            marks=[
+                pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+                pytest.mark.skipif(
+                    not PALIGEMMA_AVAILABLE,
+                    reason="Paligemma2 not available in this version of transformers",
+                ),
+            ],
+        ),
     ],
 )
 def test_mini_model_multimodal(

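The new `mini_paligemma` entry follows the suite's convention of shrinking every dimension (the inline comments record the full-size values) so a convergence run trains in seconds on one GPU. As a sketch of what that configuration instantiates, the snippet below rebuilds the tiny model directly from the kwargs in the hunk above; it assumes transformers >= 4.46 and is illustrative only.

# Sketch: instantiate the tiny PaliGemma described by the mini-model config.
from transformers.models.gemma.configuration_gemma import GemmaConfig
from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig
from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig

mini_config = PaliGemmaConfig(
    vision_config=SiglipVisionConfig(
        hidden_size=1152,
        intermediate_size=2048,
        num_attention_heads=4,
        num_hidden_layers=4,
        image_size=224,
        patch_size=14,
        projection_dim=1024,
    ),
    text_config=GemmaConfig(
        vocab_size=32000,
        hidden_size=1024,
        intermediate_size=2048,
        num_hidden_layers=4,
        num_attention_heads=4,
        num_key_value_heads=4,
        head_dim=256,
    ),
    image_token_index=4,
    vocab_size=32000,
    projection_dim=1024,
)
model = PaliGemmaForConditionalGeneration(mini_config)  # random init is fine for a convergence test
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")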
test/convergence/fp32/test_mini_models_multimodal.py

@@ -63,6 +63,10 @@ except ImportError:
     MLLAMA_AVAILABLE = False

 try:
+    import transformers
+
+    from packaging import version
+    from transformers.models.gemma.configuration_gemma import GemmaConfig
     from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
     from transformers.models.gemma2.configuration_gemma2 import Gemma2Config
     from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig
@@ -71,7 +75,7 @@ try:
     from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
     from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor

-    PALIGEMMA_AVAILABLE =
+    PALIGEMMA_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.46.0")
 except ImportError:
     PALIGEMMA_AVAILABLE = False

@@ -151,6 +155,56 @@ if MLLAMA_AVAILABLE:

 if PALIGEMMA_AVAILABLE:
     MINI_MODEL_SETUPS["mini_paligemma"] = MiniModelConfig(
+        liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False),
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma,
+        model_class=PaliGemmaForConditionalGeneration,
+        mini_model_config=PaliGemmaConfig(
+            vision_config=SiglipVisionConfig(
+                attention_dropout=0.0,
+                hidden_act="gelu_pytorch_tanh",
+                hidden_size=1152,
+                image_size=224,
+                intermediate_size=2048,  # 4304
+                layer_norm_eps=1e-06,
+                num_attention_heads=4,  # 16
+                num_channels=3,
+                num_hidden_layers=4,  # 27
+                num_image_tokens=256,
+                num_positions=256,
+                patch_size=14,
+                projection_dim=1024,  # 2304
+            ),
+            text_config=GemmaConfig(
+                vocab_size=32000,  # 256000
+                hidden_size=1024,  # 3072
+                intermediate_size=2048,  # 24576
+                num_hidden_layers=4,  # 28
+                num_attention_heads=4,  # 16
+                num_key_value_heads=4,  # 16
+                head_dim=256,
+                hidden_activation="gelu_pytorch_tanh",
+                max_position_embeddings=8192,
+                initializer_range=0.02,
+                rms_norm_eps=1e-06,
+                use_cache=True,
+                pad_token_id=0,
+                # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
+                # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+                bos_token_id=1,  # 128000
+                eos_token_id=2,  # 128001
+                tie_word_embeddings=True,
+                rope_theta=10000.0,
+                attention_bias=False,
+                attention_dropout=0.0,
+            ),
+            image_token_index=4,  # NOTE: outside the vocab size
+            attn_implementation="eager",
+            vocab_size=32000,
+            projection_dim=1024,
+        ),
+    )
+
+    MINI_MODEL_SETUPS["mini_paligemma2"] = MiniModelConfig(
         liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False),
         liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma,
         model_class=PaliGemmaForConditionalGeneration,
@@ -200,6 +254,7 @@ if PALIGEMMA_AVAILABLE:
         ),
     )

+
 if QWEN2_VL_AVAILABLE:
     MINI_MODEL_SETUPS["mini_qwen2_vl"] = MiniModelConfig(
         liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_qwen2_vl, fused_linear_cross_entropy=False),
@@ -295,7 +350,7 @@ if QWEN2_5_VL_AVAILABLE:
     )


-def create_processor(model_name):
+def create_processor(model_name: str):
     if model_name == "mini_qwen2_vl":
         tokenizer_config = load_tokenizer_config(
             os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json")
@@ -350,7 +405,7 @@ def create_processor(model_name):
         image_processor = MllamaImageProcessor(size={"height": 560, "width": 560})
         return MllamaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)

-    elif model_name
+    elif model_name.startswith("mini_paligemma"):
         tokenizer_config = load_tokenizer_config(
             os.path.join(
                 FAKE_CONFIGS_PATH,
@@ -569,6 +624,22 @@ def run_mini_model_multimodal(
                 reason="Paligemma not available in this version of transformers",
             ),
         ),
+        pytest.param(
+            "mini_paligemma2",
+            32,
+            1e-4,
+            torch.float32,
+            1e-8,
+            1e-5,
+            5e-3,
+            1e-5,
+            5e-3,
+            1e-5,
+            marks=pytest.mark.skipif(
+                not PALIGEMMA_AVAILABLE,
+                reason="Paligemma2 not available in this version of transformers",
+            ),
+        ),
     ],
 )
 def test_mini_model_multimodal(

test/utils.py

@@ -314,13 +314,15 @@ def revert_liger_kernel_to_Paligemma(model_config: MiniModelConfig):
     Revert all Liger kernel patches applied to Gemma2.
     """

+    from transformers.models.gemma import modeling_gemma
     from transformers.models.gemma2 import modeling_gemma2
     from transformers.models.paligemma import modeling_paligemma
     from transformers.models.siglip import modeling_siglip

-    importlib.reload(
+    importlib.reload(modeling_gemma)
     importlib.reload(modeling_gemma2)
     importlib.reload(modeling_paligemma)
+    importlib.reload(modeling_siglip)
     model_config.model_class = modeling_paligemma.PaliGemmaForConditionalGeneration
     print("Liger kernel patches have been reverted.")

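The revert helper works because the patches mutate module-level attributes: `importlib.reload` re-executes each patched `transformers` module from source, restoring the stock definitions. After this change all four touched modules are reloaded. The pattern in isolation, as a sketch rather than the test suite's exact code:

import importlib

from transformers.models.gemma import modeling_gemma
from transformers.models.gemma2 import modeling_gemma2
from transformers.models.paligemma import modeling_paligemma
from transformers.models.siglip import modeling_siglip

# Reloading replaces any Liger-patched functions/classes on these modules
# with the original transformers implementations.
for module in (modeling_gemma, modeling_gemma2, modeling_paligemma, modeling_siglip):
    importlib.reload(module)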