liger-kernel-nightly 0.6.2.dev20250822000312__tar.gz → 0.6.2.dev20250822031344__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/fused_linear_cross_entropy.py +41 -1
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/functional.py +2 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +3 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_fused_linear_cross_entropy.py +226 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_monkey_patch.py +5 -17
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/workflows/benchmark.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/.gitignore +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/Makefile +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/data/all_benchmark_data.csv +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/dev/modal/benchmarks.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/index.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/docs/license.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/setup.cfg +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/setup.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/gemma3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/glm4.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/glm4v.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/llama4.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/llava.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/olmo2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/paligemma.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/qwen3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/model/smollm3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/test_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/bf16/test_mini_models.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/fp32/test_mini_models.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/triton/test_triton_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/test/utils.py +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "liger_kernel_nightly"
|
7
|
-
version = "0.6.2.dev20250822000312"
|
7
|
+
version = "0.6.2.dev20250822031344"
|
8
8
|
description = "Efficient Triton kernels for LLM Training"
|
9
9
|
urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
@@ -26,6 +26,7 @@ def fused_linear_cross_entropy_forward(
|
|
26
26
|
softcap=None,
|
27
27
|
return_z_loss=False,
|
28
28
|
accum_dtype=None,
|
29
|
+
use_token_scaling=False,
|
29
30
|
):
|
30
31
|
assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
|
31
32
|
device = _input.device
|
@@ -89,6 +90,23 @@ def fused_linear_cross_entropy_forward(
|
|
89
90
|
|
90
91
|
n_rows = logits_chunk.shape[0]
|
91
92
|
|
93
|
+
# Compute predicted probabilities for token scaling if needed
|
94
|
+
if use_token_scaling:
|
95
|
+
# Compute softmax probabilities for scaling
|
96
|
+
# We need to compute this before the cross entropy kernel modifies logits_chunk
|
97
|
+
logits_for_softmax = logits_chunk.detach().clone() # Detach to avoid gradient flow
|
98
|
+
if softcap is not None:
|
99
|
+
logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
|
100
|
+
|
101
|
+
# Compute softmax to get predicted probabilities
|
102
|
+
probs = torch.softmax(logits_for_softmax, dim=-1)
|
103
|
+
|
104
|
+
# Get the predicted probability for each target token
|
105
|
+
pred_probs = torch.gather(probs, -1, target_chunk.unsqueeze(-1)).squeeze(-1)
|
106
|
+
|
107
|
+
# Store the scaling factors
|
108
|
+
scaling_factors = pred_probs.detach() # Detach to ensure no gradient flow
|
109
|
+
|
92
110
|
# unreduced loss
|
93
111
|
loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size,
|
94
112
|
z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
|
@@ -123,11 +141,23 @@ def fused_linear_cross_entropy_forward(
|
|
123
141
|
num_warps=32 if not is_hip() else 16,
|
124
142
|
)
|
125
143
|
|
144
|
+
# Apply token scaling if requested
|
145
|
+
if use_token_scaling:
|
146
|
+
loss_1d_slice = loss_1d_slice * scaling_factors
|
147
|
+
if return_z_loss:
|
148
|
+
z_loss_1d_slice = z_loss_1d_slice * scaling_factors
|
149
|
+
|
126
150
|
loss_1d[start_idx:end_idx] = loss_1d_slice
|
127
151
|
if return_z_loss:
|
128
152
|
z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
|
129
153
|
grad_logits_chunk = logits_chunk # chunk_size x V
|
130
154
|
|
155
|
+
# Apply token scaling to gradients if requested
|
156
|
+
if use_token_scaling:
|
157
|
+
# Expand scaling factors to match gradient dimensions
|
158
|
+
scaling_factors_expanded = scaling_factors.unsqueeze(-1) # chunk_size x 1
|
159
|
+
grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
|
160
|
+
|
131
161
|
grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
|
132
162
|
|
133
163
|
if grad_weight is not None:
|
@@ -136,7 +166,7 @@ def fused_linear_cross_entropy_forward(
|
|
136
166
|
if bias is not None:
|
137
167
|
torch.add(
|
138
168
|
input=grad_bias,
|
139
|
-
other=logits_chunk.sum(dim=0),
|
169
|
+
other=grad_logits_chunk.sum(dim=0),
|
140
170
|
out=grad_bias,
|
141
171
|
alpha=1.0,
|
142
172
|
)
|
@@ -146,6 +176,10 @@ def fused_linear_cross_entropy_forward(
|
|
146
176
|
# loss = loss_1d
|
147
177
|
# z_loss = z_loss_1d if return_z_loss else None
|
148
178
|
|
179
|
+
if reduction == "none":
|
180
|
+
# Return per-token losses
|
181
|
+
loss = loss_1d
|
182
|
+
z_loss = z_loss_1d if return_z_loss else None
|
149
183
|
else:
|
150
184
|
loss = torch.sum(loss_1d)
|
151
185
|
z_loss = torch.sum(z_loss_1d) if return_z_loss else None
|
@@ -221,6 +255,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
221
255
|
softcap=None,
|
222
256
|
return_z_loss: bool = False,
|
223
257
|
accum_dtype=None,
|
258
|
+
use_token_scaling: bool = False,
|
224
259
|
):
|
225
260
|
"""
|
226
261
|
Fusing the last linear layer with cross-entropy loss
|
@@ -241,6 +276,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
241
276
|
reduction: reduction to apply
|
242
277
|
accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
|
243
278
|
Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
|
279
|
+
use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
|
280
|
+
When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
|
281
|
+
Default: False.
|
244
282
|
"""
|
245
283
|
|
246
284
|
loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
|
@@ -256,6 +294,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
256
294
|
softcap=softcap,
|
257
295
|
return_z_loss=return_z_loss,
|
258
296
|
accum_dtype=accum_dtype,
|
297
|
+
use_token_scaling=use_token_scaling,
|
259
298
|
)
|
260
299
|
# downcast to dtype and store for backward
|
261
300
|
ctx.save_for_backward(
|
@@ -288,4 +327,5 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
288
327
|
None,
|
289
328
|
None,
|
290
329
|
None,
|
330
|
+
None, # use_token_scaling
|
291
331
|
)
|
@@ -65,6 +65,7 @@ def liger_fused_linear_cross_entropy(
|
|
65
65
|
softcap: Optional[float] = None,
|
66
66
|
return_z_loss: bool = False,
|
67
67
|
accum_dtype=None,
|
68
|
+
use_token_scaling: bool = False,
|
68
69
|
):
|
69
70
|
loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
|
70
71
|
input,
|
@@ -79,6 +80,7 @@ def liger_fused_linear_cross_entropy(
|
|
79
80
|
softcap,
|
80
81
|
return_z_loss,
|
81
82
|
accum_dtype,
|
83
|
+
use_token_scaling,
|
82
84
|
)
|
83
85
|
if not return_z_loss:
|
84
86
|
return loss
|
@@ -16,6 +16,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
|
|
16
16
|
softcap: Optional[float] = None,
|
17
17
|
return_z_loss: bool = False,
|
18
18
|
accum_dtype: Optional[torch.dtype] = None,
|
19
|
+
use_token_scaling: bool = False,
|
19
20
|
):
|
20
21
|
super().__init__()
|
21
22
|
assert (label_smoothing >= 0) and (label_smoothing <= 1), (
|
@@ -34,6 +35,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
|
|
34
35
|
self.softcap = softcap
|
35
36
|
self.return_z_loss = return_z_loss
|
36
37
|
self.accum_dtype = accum_dtype
|
38
|
+
self.use_token_scaling = use_token_scaling
|
37
39
|
|
38
40
|
def forward(self, lin_weight, _input, target, bias=None):
|
39
41
|
loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
|
@@ -49,6 +51,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
|
|
49
51
|
self.softcap,
|
50
52
|
self.return_z_loss,
|
51
53
|
self.accum_dtype,
|
54
|
+
self.use_token_scaling,
|
52
55
|
)
|
53
56
|
if not self.return_z_loss:
|
54
57
|
return loss
|
@@ -352,3 +352,229 @@ def test_amp(B, T, H, V, bias, cast_dtype, accum_dtype, atol, rtol):
|
|
352
352
|
atol=atol,
|
353
353
|
rtol=rtol,
|
354
354
|
)
|
355
|
+
|
356
|
+
|
357
|
+
def test_correctness_token_scaling():
|
358
|
+
"""Test that token scaling produces the correct loss values and gradients."""
|
359
|
+
B, T, H, V = 2, 4, 8, 16
|
360
|
+
dtype = torch.float32
|
361
|
+
|
362
|
+
# Create inputs
|
363
|
+
_input = torch.randn(B * T, H, device=device, dtype=dtype, requires_grad=True)
|
364
|
+
target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
|
365
|
+
|
366
|
+
# Create weights
|
367
|
+
weight = torch.randn(V, H, device=device, dtype=dtype)
|
368
|
+
bias = torch.randn(V, device=device, dtype=dtype)
|
369
|
+
|
370
|
+
# Test using functional API with token scaling
|
371
|
+
loss_scaled = liger_fused_linear_cross_entropy(
|
372
|
+
input=_input,
|
373
|
+
weight=weight,
|
374
|
+
target=target,
|
375
|
+
bias=bias,
|
376
|
+
ignore_index=-100,
|
377
|
+
reduction="none", # Use "none" to get per-token losses
|
378
|
+
use_token_scaling=True,
|
379
|
+
)
|
380
|
+
|
381
|
+
# Compare with manual implementation
|
382
|
+
# Compute logits
|
383
|
+
logits = _input @ weight.t()
|
384
|
+
if bias is not None:
|
385
|
+
logits = logits + bias
|
386
|
+
|
387
|
+
# Compute standard cross entropy loss per token
|
388
|
+
ce_loss = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
|
389
|
+
|
390
|
+
# Compute predicted probabilities for target tokens
|
391
|
+
pred_probs = torch.softmax(logits, dim=-1).gather(1, target.unsqueeze(-1)).squeeze(-1).detach()
|
392
|
+
|
393
|
+
# Scale by predicted probabilities
|
394
|
+
expected_loss = ce_loss * pred_probs
|
395
|
+
|
396
|
+
# Check that losses are close
|
397
|
+
assert torch.allclose(loss_scaled, expected_loss, atol=1e-4, rtol=1e-4)
|
398
|
+
|
399
|
+
# Test gradients
|
400
|
+
loss_scaled.sum().backward(retain_graph=True)
|
401
|
+
grad_scaled = _input.grad.clone()
|
402
|
+
_input.grad.zero_()
|
403
|
+
|
404
|
+
expected_loss.sum().backward(retain_graph=True)
|
405
|
+
grad_expected = _input.grad.clone()
|
406
|
+
_input.grad.zero_()
|
407
|
+
|
408
|
+
# Check that gradients are close
|
409
|
+
assert torch.allclose(grad_scaled, grad_expected, atol=1e-4, rtol=1e-4)
|
410
|
+
|
411
|
+
|
412
|
+
def test_correctness_token_scaling_consistency():
|
413
|
+
"""Test that token scaling is consistent between functional and module APIs."""
|
414
|
+
B, T, H, V = 2, 4, 8, 16
|
415
|
+
dtype = torch.float32
|
416
|
+
|
417
|
+
# Create inputs
|
418
|
+
_input = torch.randn(B * T, H, device=device, dtype=dtype, requires_grad=True)
|
419
|
+
target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
|
420
|
+
|
421
|
+
# Create weights
|
422
|
+
weight = torch.randn(V, H, device=device, dtype=dtype)
|
423
|
+
bias = torch.randn(V, device=device, dtype=dtype)
|
424
|
+
|
425
|
+
# Test functional API
|
426
|
+
loss_functional = liger_fused_linear_cross_entropy(
|
427
|
+
input=_input,
|
428
|
+
weight=weight,
|
429
|
+
target=target,
|
430
|
+
bias=bias,
|
431
|
+
ignore_index=-100,
|
432
|
+
reduction="sum",
|
433
|
+
use_token_scaling=True,
|
434
|
+
)
|
435
|
+
|
436
|
+
# Test module API
|
437
|
+
ce_loss_module = LigerFusedLinearCrossEntropyLoss(
|
438
|
+
ignore_index=-100,
|
439
|
+
reduction="sum",
|
440
|
+
use_token_scaling=True,
|
441
|
+
)
|
442
|
+
|
443
|
+
loss_module = ce_loss_module(weight, _input, target, bias)
|
444
|
+
|
445
|
+
# Check that losses are identical
|
446
|
+
assert torch.allclose(loss_functional, loss_module, atol=1e-6, rtol=1e-6)
|
447
|
+
|
448
|
+
# Test gradients
|
449
|
+
loss_functional.backward(retain_graph=True)
|
450
|
+
grad_functional = _input.grad.clone()
|
451
|
+
_input.grad.zero_()
|
452
|
+
|
453
|
+
loss_module.backward(retain_graph=True)
|
454
|
+
grad_module = _input.grad.clone()
|
455
|
+
_input.grad.zero_()
|
456
|
+
|
457
|
+
# Check that gradients are identical
|
458
|
+
assert torch.allclose(grad_functional, grad_module, atol=1e-6, rtol=1e-6)
|
459
|
+
|
460
|
+
|
461
|
+
def test_correctness_token_scaling_functional():
|
462
|
+
"""Test token scaling using the functional API."""
|
463
|
+
B, T, H, V = 2, 4, 8, 16
|
464
|
+
dtype = torch.float32
|
465
|
+
|
466
|
+
# Create inputs
|
467
|
+
_input = torch.randn(B * T, H, device=device, dtype=dtype)
|
468
|
+
x1 = _input.detach().clone().requires_grad_(True)
|
469
|
+
x2 = _input.detach().clone().requires_grad_(True)
|
470
|
+
|
471
|
+
target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
|
472
|
+
|
473
|
+
# Create weights
|
474
|
+
weight = torch.randn(V, H, device=device, dtype=dtype)
|
475
|
+
bias = torch.randn(V, device=device, dtype=dtype)
|
476
|
+
|
477
|
+
# Test using functional API with token scaling
|
478
|
+
y1 = liger_fused_linear_cross_entropy(
|
479
|
+
input=x1,
|
480
|
+
weight=weight,
|
481
|
+
target=target,
|
482
|
+
bias=bias,
|
483
|
+
ignore_index=-100,
|
484
|
+
lse_square_scale=0.0,
|
485
|
+
label_smoothing=0.0,
|
486
|
+
reduction="sum", # Use sum for easier verification
|
487
|
+
softcap=None,
|
488
|
+
return_z_loss=False,
|
489
|
+
accum_dtype=None,
|
490
|
+
use_token_scaling=True,
|
491
|
+
)
|
492
|
+
|
493
|
+
# Compare with manual implementation
|
494
|
+
# Compute logits
|
495
|
+
logits = x2 @ weight.t()
|
496
|
+
if bias is not None:
|
497
|
+
logits = logits + bias
|
498
|
+
|
499
|
+
# Compute softmax probabilities
|
500
|
+
probs = torch.softmax(logits.detach(), dim=-1) # Detach to avoid gradient flow
|
501
|
+
|
502
|
+
# Get predicted probabilities for target tokens
|
503
|
+
pred_probs = torch.gather(probs, -1, target.unsqueeze(-1)).squeeze(-1)
|
504
|
+
|
505
|
+
# Compute standard cross entropy loss
|
506
|
+
ce_loss = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
|
507
|
+
|
508
|
+
# Scale by predicted probabilities
|
509
|
+
scaled_loss = ce_loss * pred_probs
|
510
|
+
|
511
|
+
# Sum over all tokens
|
512
|
+
y2 = scaled_loss.sum()
|
513
|
+
|
514
|
+
# Check that losses are close
|
515
|
+
assert torch.allclose(y1, y2, atol=1e-5, rtol=1e-5)
|
516
|
+
|
517
|
+
# Test gradients
|
518
|
+
y1.backward()
|
519
|
+
y2.backward()
|
520
|
+
|
521
|
+
# Check that gradients are close
|
522
|
+
assert torch.allclose(x1.grad, x2.grad, atol=1e-5, rtol=1e-5)
|
523
|
+
|
524
|
+
|
525
|
+
def test_correctness_token_scaling_module():
|
526
|
+
"""Test token scaling using the module API."""
|
527
|
+
B, T, H, V = 2, 4, 8, 16
|
528
|
+
dtype = torch.float32
|
529
|
+
|
530
|
+
# Create inputs
|
531
|
+
_input = torch.randn(B * T, H, device=device, dtype=dtype)
|
532
|
+
x1 = _input.detach().clone().requires_grad_(True)
|
533
|
+
x2 = _input.detach().clone().requires_grad_(True)
|
534
|
+
|
535
|
+
target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long)
|
536
|
+
|
537
|
+
# Create module with token scaling
|
538
|
+
ce_loss = LigerFusedLinearCrossEntropyLoss(
|
539
|
+
ignore_index=-100,
|
540
|
+
reduction="sum",
|
541
|
+
use_token_scaling=True,
|
542
|
+
)
|
543
|
+
|
544
|
+
# Create weights
|
545
|
+
weight = torch.randn(V, H, device=device, dtype=dtype)
|
546
|
+
bias = torch.randn(V, device=device, dtype=dtype)
|
547
|
+
|
548
|
+
# Test using module API with token scaling
|
549
|
+
y1 = ce_loss(weight, x1, target, bias)
|
550
|
+
|
551
|
+
# Compare with manual implementation
|
552
|
+
# Compute logits
|
553
|
+
logits = x2 @ weight.t()
|
554
|
+
if bias is not None:
|
555
|
+
logits = logits + bias
|
556
|
+
|
557
|
+
# Compute softmax probabilities
|
558
|
+
probs = torch.softmax(logits.detach(), dim=-1) # Detach to avoid gradient flow
|
559
|
+
|
560
|
+
# Get predicted probabilities for target tokens
|
561
|
+
pred_probs = torch.gather(probs, -1, target.unsqueeze(-1)).squeeze(-1)
|
562
|
+
|
563
|
+
# Compute standard cross entropy loss
|
564
|
+
ce_loss_manual = torch.nn.functional.cross_entropy(logits, target, ignore_index=-100, reduction="none")
|
565
|
+
|
566
|
+
# Scale by predicted probabilities
|
567
|
+
scaled_loss = ce_loss_manual * pred_probs
|
568
|
+
|
569
|
+
# Sum over all tokens
|
570
|
+
y2 = scaled_loss.sum()
|
571
|
+
|
572
|
+
# Check that losses are close
|
573
|
+
assert torch.allclose(y1, y2, atol=1e-5, rtol=1e-5)
|
574
|
+
|
575
|
+
# Test gradients
|
576
|
+
y1.backward()
|
577
|
+
y2.backward()
|
578
|
+
|
579
|
+
# Check that gradients are close
|
580
|
+
assert torch.allclose(x1.grad, x2.grad, atol=1e-5, rtol=1e-5)
|
@@ -1617,38 +1617,26 @@ def test_apply_liger_kernel_to_instance_for_glm4():
|
|
1617
1617
|
|
1618
1618
|
# Check that model instance variables are not yet patched with Liger modules
|
1619
1619
|
assert inspect.getsource(dummy_model_instance.forward) != inspect.getsource(glm4_lce_forward)
|
1620
|
-
assert inspect.getsource(dummy_model_instance.
|
1621
|
-
|
1622
|
-
)
|
1623
|
-
for layer in dummy_model_instance.language_model.layers:
|
1620
|
+
assert inspect.getsource(dummy_model_instance.model.norm.forward) != inspect.getsource(LigerRMSNorm.forward)
|
1621
|
+
for layer in dummy_model_instance.model.layers:
|
1624
1622
|
assert inspect.getsource(layer.mlp.forward) != inspect.getsource(LigerPhi3SwiGLUMLP.forward)
|
1625
1623
|
assert inspect.getsource(layer.input_layernorm.forward) != inspect.getsource(LigerRMSNorm.forward)
|
1626
1624
|
assert inspect.getsource(layer.post_attention_layernorm.forward) != inspect.getsource(LigerRMSNorm.forward)
|
1627
1625
|
assert inspect.getsource(layer.post_self_attn_layernorm.forward) != inspect.getsource(LigerRMSNorm.forward)
|
1628
1626
|
assert inspect.getsource(layer.post_mlp_layernorm.forward) != inspect.getsource(LigerRMSNorm.forward)
|
1629
|
-
for vision_block in dummy_model_instance.visual.blocks:
|
1630
|
-
assert inspect.getsource(vision_block.norm1.forward) != inspect.getsource(LigerRMSNorm.forward)
|
1631
|
-
assert inspect.getsource(vision_block.norm2.forward) != inspect.getsource(LigerRMSNorm.forward)
|
1632
|
-
assert inspect.getsource(vision_block.mlp.forward) != inspect.getsource(LigerSwiGLUMLP.forward)
|
1633
1627
|
|
1634
1628
|
# Test applying kernels to the model instance
|
1635
1629
|
_apply_liger_kernel_to_instance(model=dummy_model_instance)
|
1636
1630
|
|
1637
1631
|
# Check that the model's instance variables were correctly patched with Liger modules
|
1638
|
-
assert inspect.getsource(dummy_model_instance.forward)
|
1639
|
-
assert inspect.getsource(dummy_model_instance.
|
1640
|
-
|
1641
|
-
)
|
1642
|
-
for layer in dummy_model_instance.language_model.layers:
|
1632
|
+
assert inspect.getsource(dummy_model_instance.forward) == inspect.getsource(glm4_lce_forward)
|
1633
|
+
assert inspect.getsource(dummy_model_instance.model.norm.forward) == inspect.getsource(LigerRMSNorm.forward)
|
1634
|
+
for layer in dummy_model_instance.model.layers:
|
1643
1635
|
assert inspect.getsource(layer.mlp.forward) == inspect.getsource(LigerPhi3SwiGLUMLP.forward)
|
1644
1636
|
assert inspect.getsource(layer.input_layernorm.forward) == inspect.getsource(LigerRMSNorm.forward)
|
1645
1637
|
assert inspect.getsource(layer.post_attention_layernorm.forward) == inspect.getsource(LigerRMSNorm.forward)
|
1646
1638
|
assert inspect.getsource(layer.post_self_attn_layernorm.forward) == inspect.getsource(LigerRMSNorm.forward)
|
1647
1639
|
assert inspect.getsource(layer.post_mlp_layernorm.forward) == inspect.getsource(LigerRMSNorm.forward)
|
1648
|
-
for vision_block in dummy_model_instance.visual.blocks:
|
1649
|
-
assert inspect.getsource(vision_block.norm1.forward) == inspect.getsource(LigerRMSNorm.forward)
|
1650
|
-
assert inspect.getsource(vision_block.norm2.forward) == inspect.getsource(LigerRMSNorm.forward)
|
1651
|
-
assert inspect.getsource(vision_block.mlp.forward) == inspect.getsource(LigerSwiGLUMLP.forward)
|
1652
1640
|
|
1653
1641
|
try:
|
1654
1642
|
print(dummy_model_instance)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{liger_kernel_nightly-0.6.2.dev20250822000312 → liger_kernel_nightly-0.6.2.dev20250822031344}/NOTICE
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|