liger-kernel-nightly 0.6.2.dev20250903164350.tar.gz → 0.6.2.dev20250905160847.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/data/all_benchmark_data.csv +64 -0
- liger_kernel_nightly-0.6.2.dev20250905160847/benchmark/scripts/benchmark_grpo_loss.py +234 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +4 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/grpo_loss.py +38 -4
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/__init__.py +3 -0
- liger_kernel_nightly-0.6.2.dev20250905160847/src/liger_kernel/transformers/model/glm4v_moe.py +152 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/monkey_patch.py +102 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel_nightly.egg-info/SOURCES.txt +2 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/test_grpo_loss.py +35 -4
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/bf16/test_mini_models.py +107 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/bf16/test_mini_models_multimodal.py +1 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/bf16/test_mini_models_with_logits.py +107 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/fp32/test_mini_models.py +104 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/fp32/test_mini_models_with_logits.py +103 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_monkey_patch.py +118 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/utils.py +12 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/workflows/benchmark.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/.gitignore +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/Makefile +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/dev/modal/benchmarks.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/index.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/docs/license.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/setup.cfg +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/setup.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/gemma3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/glm4.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/glm4v.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/llama4.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/llava.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/olmo2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/paligemma.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/qwen3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/model/smollm3.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/test_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20250903164350 → liger_kernel_nightly-0.6.2.dev20250905160847}/test/triton/test_triton_monkey_patch.py +0 -0
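Two areas account for most of the substantive changes above: a new GLM-4V-MoE model path (`src/liger_kernel/transformers/model/glm4v_moe.py`, plus the monkey-patch and convergence-test additions) and the GRPO loss path (`src/liger_kernel/chunked_loss/grpo_loss.py`, `fused_linear_ppo.py`, and the new `benchmark/scripts/benchmark_grpo_loss.py`). The GRPO benchmark rows added below are parameterized by an `importance_sampling_level` of `"token"` or `"sequence"`. As a rough conceptual sketch of that distinction (plain PyTorch, not the liger_kernel API; tensor names and shapes here are assumptions), the difference is whether the PPO-style importance ratio is computed per token or once per sequence and broadcast back over its tokens:

```python
import torch

def importance_ratio(logp: torch.Tensor, old_logp: torch.Tensor,
                     mask: torch.Tensor, level: str = "token") -> torch.Tensor:
    """Conceptual sketch only; names and shapes are assumptions, not the library API.

    logp, old_logp, mask: (batch, seq_len) tensors over the completion tokens.
    """
    delta = (logp - old_logp) * mask
    if level == "token":
        # One ratio per token: exp(logp_t - old_logp_t), masked positions zeroed.
        return torch.exp(delta) * mask
    if level == "sequence":
        # One ratio per sequence: exp of the masked mean log-ratio,
        # broadcast back over that sequence's tokens.
        seq_delta = delta.sum(-1) / mask.sum(-1).clamp(min=1)
        return torch.exp(seq_delta).unsqueeze(-1) * mask
    raise ValueError(f"unknown importance_sampling_level: {level}")
```

The clipped surrogate objective is then formed from this ratio and the advantages; the fused Liger implementation additionally chunks the final linear projection and loss so the full vocabulary logits are never materialized, which is consistent with the much flatter liger memory numbers in the benchmark rows below.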
@@ -1575,6 +1575,70 @@ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,2,40.75366401672363,40.749671173095706,40.75765686035156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,4,80.95231628417969,80.95231628417969,80.95231628417969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,8,163.58604431152344,163.58604431152344,163.58604431152344,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,16,323.6761474609375,323.6761474609375,323.6761474609375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,2,23.71225643157959,23.612825775146483,23.8354434967041,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,4,46.86131286621094,46.80355911254883,46.91906661987304,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,8,94.54898834228516,94.54898834228516,94.54898834228516,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,16,189.99501037597656,189.99501037597656,189.99501037597656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,2,42.67263984680176,42.54085083007813,42.80442886352539,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,4,82.2446060180664,82.2446060180664,82.2446060180664,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,8,167.00416564941406,167.00416564941406,167.00416564941406,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,16,327.0911865234375,327.0911865234375,327.0911865234375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,2,45.36115264892578,45.241344451904304,45.480960845947266,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,4,90.00038146972656,90.00038146972656,90.00038146972656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,8,177.22674560546875,177.22674560546875,177.22674560546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,16,356.5383605957031,356.5383605957031,356.5383605957031,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,2,1.814527988433838,1.8124799728393555,1.8167808055877686,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,4,1.84934401512146,1.8472959995269775,1.8524160385131836,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8872319459915161,1.893990397453308,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,16,1.9722239971160889,1.9660799503326416,1.9763200283050537,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,2,22.014975547790527,21.710438537597657,22.19417533874512,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,4,41.83603096008301,41.752165222167974,41.91989669799805,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,8,81.66400146484375,81.66400146484375,81.66400146484375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,16,162.6429443359375,162.6429443359375,162.6429443359375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,2,40.72038269042969,40.71178131103516,40.728984069824214,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,4,81.69369506835938,81.69369506835938,81.69369506835938,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,8,162.79653930664062,162.79653930664062,162.79653930664062,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,16,323.6546630859375,323.6546630859375,323.6546630859375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,2,23.70047950744629,23.628594589233398,23.732429122924806,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,4,47.36921691894531,47.085364532470706,47.65306930541992,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,8,94.83366394042969,94.83366394042969,94.83366394042969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,16,190.0963897705078,190.0963897705078,190.0963897705078,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,2,42.318336486816406,42.15214080810547,42.48453216552734,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,4,82.4616928100586,82.4616928100586,82.4616928100586,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,8,163.43756103515625,163.43756103515625,163.43756103515625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,16,325.4384765625,325.4384765625,325.4384765625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,2,45.99193572998047,45.80761489868165,46.176256561279295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,4,88.57190704345703,88.57190704345703,88.57190704345703,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,8,176.94105529785156,176.94105529785156,176.94105529785156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,16,356.0478820800781,356.0478820800781,356.0478820800781,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,2,1.8242560029029846,1.8102271556854248,1.8309119939804077,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,4,1.84934401512146,1.846886396408081,1.8534400463104248,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8892799615859985,1.8933759927749634,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,16,1.9752960205078125,1.9722239971160889,1.977344036102295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,2,22.0262393951416,21.80997085571289,22.20482559204102,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,4,41.54521560668945,41.224806213378905,41.865625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
|
1632
|
+
fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,8,81.21753692626953,81.21753692626953,81.21753692626953,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
|
1633
|
+
fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,16,160.82022094726562,160.82022094726562,160.82022094726562,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
|
1634
|
+
fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
|
1635
|
+
fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
|
1636
|
+
fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
|
1637
|
+
fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
|
1638
|
+
fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
|
1639
|
+
fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
|
1640
|
+
fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
|
1641
|
+
fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
|
1578
1642
|
llama4_rope,liger,forward,speed,ms,H,hidden size,512,0.08249600231647491,0.08102399855852127,0.08432000130414963,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
|
1579
1643
|
llama4_rope,liger,forward,speed,ms,H,hidden size,2048,0.08169600367546082,0.08037760108709335,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
|
1580
1644
|
llama4_rope,liger,forward,speed,ms,H,hidden size,8192,0.08128000050783157,0.07980799674987793,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
|
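Read together, the added rows show the fused Liger path tracking the torch reference on speed while keeping memory roughly flat as batch size grows (about 7.8 GB versus about 37.5 GB for torch at B=16 on the full pass). A throwaway sketch, not part of the package, for slicing these rows back out of benchmark/data/all_benchmark_data.csv; it assumes only the column order visible in the rows above and that the three measurement columns are the reported quantiles:

import csv

# Filter the GSPO (sequence-level) rows by kernel name; a header row, if present,
# simply fails the match and is skipped.
with open("benchmark/data/all_benchmark_data.csv", newline="") as f:
    rows = [r for r in csv.reader(f) if r and r[0] == "fused_linear_grpo_loss_sequence"]

for r in rows:
    # Assumed positions: provider, mode, metric, unit, ..., x value, first measurement column
    provider, mode, metric, unit, batch, value = r[1], r[2], r[3], r[4], r[7], r[8]
    print(f"{provider:>5} {mode:>8} {metric:>6} B={batch:>2}: {value} {unit}")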
@@ -0,0 +1,234 @@
+import os
+import sys
+
+import torch
+import triton
+
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
+
+from liger_kernel.utils import infer_device
+
+device = infer_device()
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+
+#############################################################################
+# Test the memory consumption of the linear fused GRPO loss
+#############################################################################
+
+
+def bench_memory_fused_linear_grpo_loss(
+    input: SingleBenchmarkRunInput,
+) -> SingleBenchmarkRunOutput:
+    from test.chunked_loss.test_grpo_loss import LigerLMHeadGRPO
+    from test.chunked_loss.test_grpo_loss import TorchLMHeadGRPO
+
+    B = input.x
+    T = input.extra_benchmark_config["T"]
+    H = input.extra_benchmark_config["H"]
+    V = input.extra_benchmark_config["V"]
+    dtype = input.extra_benchmark_config["dtype"]
+    importance_sampling_level = input.extra_benchmark_config["importance_sampling_level"]
+    provider = input.kernel_provider
+
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_grpo = TorchLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to(
+        device
+    )
+    liger_lm_head_grpo = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to(
+        device
+    )
+
+    # Create inputs
+    _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
+    selected_token_ids = torch.randint(0, V, (B, T), dtype=torch.long, device=device)
+    attention_mask = torch.ones(B, T, device=device)
+    advantages = torch.randn(B, dtype=dtype, device=device)
+    ref_input = torch.randn(B, T, H, dtype=dtype, device=device)
+
+    torch_fwd = lambda: torch_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[
+        0
+    ]
+    liger_fwd = lambda: liger_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[
+        0
+    ]
+
+    def fwd():
+        if provider == "liger":
+            return liger_fwd()
+        elif provider == "torch":
+            return torch_fwd()
+
+    def full():
+        y = fwd()
+        y.backward()
+
+    mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES)
+    return SingleBenchmarkRunOutput(
+        y_20=mem_20,
+        y_50=mem_50,
+        y_80=mem_80,
+    )
+
+
+#############################################################################
+# Test the speed of the fused linear GRPO loss
+#############################################################################
+
+
+def bench_speed_fused_linear_grpo_loss(
+    input: SingleBenchmarkRunInput,
+) -> SingleBenchmarkRunOutput:
+    from test.chunked_loss.test_grpo_loss import LigerLMHeadGRPO
+    from test.chunked_loss.test_grpo_loss import TorchLMHeadGRPO
+
+    B = input.x
+    T = input.extra_benchmark_config["T"]
+    H = input.extra_benchmark_config["H"]
+    V = input.extra_benchmark_config["V"]
+    dtype = input.extra_benchmark_config["dtype"]
+    importance_sampling_level = input.extra_benchmark_config["importance_sampling_level"]
+    provider = input.kernel_provider
+    mode = input.kernel_operation_mode
+
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_grpo = TorchLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to(
+        device
+    )
+    liger_lm_head_grpo = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to(
+        device
+    )
+
+    # Create inputs
+    _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
+    selected_token_ids = torch.randint(0, V, (B, T), dtype=torch.long, device=device)
+    attention_mask = torch.ones(B, T, device=device)
+    advantages = torch.randn(B, dtype=dtype, device=device)
+    ref_input = torch.randn(B, T, H, dtype=dtype, device=device)
+
+    torch_fwd = lambda: torch_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[
+        0
+    ]
+    liger_fwd = lambda: liger_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[
+        0
+    ]
+
+    def fwd():
+        if provider == "liger":
+            return liger_fwd()
+        elif provider == "torch":
+            return torch_fwd()
+
+    if mode == "forward":
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            fwd,
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    elif mode == "backward":
+        y = fwd()
+
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            lambda: y.backward(retain_graph=True),
+            grad_to_none=[_input],
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    elif mode == "full":
+
+        def full():
+            y = fwd()
+            y.backward()
+
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            full,
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    return SingleBenchmarkRunOutput(
+        y_20=ms_20,
+        y_50=ms_50,
+        y_80=ms_80,
+    )
+
+
+if __name__ == "__main__":
+    args = parse_benchmark_script_args()
+
+    # Benchmark token-level importance sampling (original GRPO)
+    token_configs = {
+        "kernel_name": "fused_linear_grpo_loss_token",
+        "x_name": "B",
+        "x_label": "B",
+        "x_values": [2**i for i in range(1, 5)],
+        "kernel_providers": ["liger", "torch"],
+        "extra_benchmark_configs": [
+            {
+                "T": 1024,
+                "H": 4096,
+                "V": 128256,
+                "importance_sampling_level": "token",
+                "dtype": torch.bfloat16,
+            }
+        ],
+        "overwrite": args.overwrite,
+    }
+
+    # Benchmark sequence-level importance sampling (GSPO)
+    sequence_configs = {
+        "kernel_name": "fused_linear_grpo_loss_sequence",
+        "x_name": "B",
+        "x_label": "B",
+        "x_values": [2**i for i in range(1, 5)],
+        "kernel_providers": ["liger", "torch"],
+        "extra_benchmark_configs": [
+            {
+                "T": 1024,
+                "H": 4096,
+                "V": 128256,
+                "importance_sampling_level": "sequence",
+                "dtype": torch.bfloat16,
+            }
+        ],
+        "overwrite": args.overwrite,
+    }
+
+    # Run benchmarks for token-level (GRPO)
+    print("Benchmarking GRPO (token-level importance sampling)...")
+    run_benchmarks(
+        bench_test_fn=bench_speed_fused_linear_grpo_loss,
+        kernel_operation_modes=["forward", "full", "backward"],
+        metric_name="speed",
+        metric_unit="ms",
+        **token_configs,
+    )
+    run_benchmarks(
+        bench_test_fn=bench_memory_fused_linear_grpo_loss,
+        kernel_operation_modes=["full"],
+        metric_name="memory",
+        metric_unit="MB",
+        **token_configs,
+    )
+
+    # Run benchmarks for sequence-level (GSPO)
+    print("Benchmarking GSPO (sequence-level importance sampling)...")
+    run_benchmarks(
+        bench_test_fn=bench_speed_fused_linear_grpo_loss,
+        kernel_operation_modes=["forward", "full", "backward"],
+        metric_name="speed",
+        metric_unit="ms",
+        **sequence_configs,
+    )
+    run_benchmarks(
+        bench_test_fn=bench_memory_fused_linear_grpo_loss,
+        kernel_operation_modes=["full"],
+        metric_name="memory",
+        metric_unit="MB",
+        **sequence_configs,
+    )
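The new benchmark/scripts/benchmark_grpo_loss.py is normally driven end to end by run_benchmarks, but a single configuration can presumably be measured directly by building the input object by hand. A sketch run from the script's own namespace; it assumes SingleBenchmarkRunInput accepts as constructor arguments the fields the script reads (x, kernel_provider, kernel_operation_mode, extra_benchmark_config), which is not shown in this diff:

# Hypothetical one-off call that bypasses run_benchmarks.
single_run = SingleBenchmarkRunInput(
    x=2,  # batch size B
    kernel_provider="liger",
    kernel_operation_mode="forward",
    extra_benchmark_config={
        "T": 1024,
        "H": 4096,
        "V": 128256,
        "importance_sampling_level": "sequence",
        "dtype": torch.bfloat16,
    },
)
out = bench_speed_fused_linear_grpo_loss(single_run)
# The output carries the same quantile fields the script writes to the CSV.
print(out.y_20, out.y_50, out.y_80)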
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "liger_kernel_nightly"
-version = "0.6.2.
+version = "0.6.2.dev20250905160847"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -34,6 +34,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         beta=0.04,
         loss_type="bnpo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         compiled=True,
         use_ref_model=False,
@@ -92,6 +93,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             beta=beta,
             loss_type=loss_type,
             max_completion_length=max_completion_length,
+            importance_sampling_level=importance_sampling_level,
             temperature=temperature,
             use_ref_model=use_ref_model,
             ppo_loss_fn=cls.ppo_loss_fn,
@@ -261,6 +263,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         beta=0.04,
         loss_type="bnpo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         use_ref_model=False,
         ppo_loss_fn=None,
@@ -292,6 +295,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             beta=beta,
             loss_type=loss_type,
             max_completion_length=max_completion_length,
+            importance_sampling_level=importance_sampling_level,
         )
 
         return chunk_loss, chunk_metrics
@@ -31,6 +31,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         beta=0.04,
         loss_type="bnpo",  # ["grpo", "bnpo", "dr_grpo"]
         max_completion_length=None,  # Required for dr_grpo
+        importance_sampling_level="token",  # ["token", "sequence"] - new parameter for GSPO
         **kwargs,
     ):
         """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -50,7 +51,22 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 
         # Compute policy gradient loss with importance sampling ratio
         old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach()
-
+        log_ratio = per_token_logps - old_per_token_logps
+
+        if importance_sampling_level == "token":
+            log_importance_weights = log_ratio
+        elif importance_sampling_level == "sequence":
+            log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
+            log_importance_weights = log_importance_weights.unsqueeze(-1)
+        else:
+            raise ValueError(
+                f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' "
+                "and 'sequence'."
+            )
+
+        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+        coef_1 = torch.exp(log_importance_weights)
         coef_2 = clip_coef_fn(coef_1, epsilon_low, epsilon_high)
         per_token_loss1 = coef_1 * advantages.unsqueeze(1)
         per_token_loss2 = coef_2 * advantages.unsqueeze(1)
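The hunk above is the core of the change: instead of always exponentiating the per-token log-ratio, grpo_loss.py can now average the log-ratio over the completion and use a single ratio per sequence (the GSPO-style weighting). A self-contained PyTorch sketch of the two modes, mirroring the diff; the function and tensor names here are illustrative, not the kernel's own:

import torch

def importance_weights(per_token_logps, old_per_token_logps, attention_mask, level="token"):
    # per_token_logps / old_per_token_logps: (B, T) log-probs of the sampled tokens
    # attention_mask: (B, T), 1 for completion tokens
    log_ratio = per_token_logps - old_per_token_logps
    if level == "token":
        log_w = log_ratio  # (B, T): one ratio per token, original GRPO behaviour
    elif level == "sequence":
        # GSPO: mean per-token log-ratio over the completion, one ratio per sequence
        log_w = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
        log_w = log_w.unsqueeze(-1)  # (B, 1), broadcast against (B, T) downstream
    else:
        raise ValueError(f"Unknown importance sampling level: {level}")
    return torch.exp(log_w)

B, T = 2, 5
new_lp, old_lp = torch.randn(B, T), torch.randn(B, T)
mask = torch.ones(B, T)
print(importance_weights(new_lp, old_lp, mask, "token").shape)     # torch.Size([2, 5])
print(importance_weights(new_lp, old_lp, mask, "sequence").shape)  # torch.Size([2, 1])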
@@ -85,9 +101,19 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         metrics = []
         if beta != 0.0:
             metrics.append(((kl_div * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)))
-
-
-
+
+        # Adjust clipping metric calculation based on importance sampling level
+        if importance_sampling_level == "token":
+            is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
+                (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
+            )
+        else:  # sequence level
+            # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
+            is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (advantages < 0)) | (
+                (coef_1.squeeze(-1) > 1 + epsilon_high) & (advantages > 0)
+            )
+            is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
+
         metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))
         return loss, metrics
 
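The metric hunk above keeps the clipped-ratio bookkeeping consistent with whichever shape coef_1 now has. A standalone sketch of that logic; unlike the real kernel, which normalizes by full_attention_mask over the whole batch, this toy version normalizes by the mask it is given:

import torch

def clip_fraction(coef_1, advantages, attention_mask, epsilon_low=0.2, epsilon_high=0.2):
    # coef_1: importance ratios, (B, T) for token-level or (B, 1) for sequence-level
    # advantages: (B,); attention_mask: (B, T)
    if coef_1.shape[-1] == attention_mask.shape[-1]:  # token-level ratios
        is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
            (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
        )
    else:  # sequence-level ratios, shape (B, 1)
        is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (advantages < 0)) | (
            (coef_1.squeeze(-1) > 1 + epsilon_high) & (advantages > 0)
        )
        is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
    # Fraction of completion tokens whose ratio hit the clipping bound
    return (is_clipped * attention_mask).sum() / attention_mask.sum().clamp(min=1.0)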
@@ -111,6 +137,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         epsilon_high=0.2,
         loss_type="bnpo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         compiled=True,
         use_ref_model=True,
@@ -132,6 +159,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             beta (float): Weight for the KL penalty
             loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+            importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits
             compiled (bool): Whether to use torch compile
             use_ref_model (bool): Whether to use a reference model
@@ -162,6 +190,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             compiled=compiled,
             use_ref_model=use_ref_model,
             chunk_size=chunk_size,
+            importance_sampling_level=importance_sampling_level,
         )
 
     @staticmethod
@@ -187,6 +216,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             None,  # grad_epsilon_high
             None,  # grad_loss_type (string, not differentiable)
             None,  # grad_max_completion_length (int, not differentiable)
+            None,  # grad_importance_sampling_level (string, not differentiable)
             None,  # grad_temperature
             None,  # grad_compiled
             None,  # grad_use_ref_model
@@ -207,6 +237,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         epsilon_high: float = 0.2,
         loss_type: str = "bnpo",
         max_completion_length: Optional[int] = None,
+        importance_sampling_level: str = "token",
         temperature: float = 1.0,
     ):
         """
@@ -219,6 +250,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             epsilon_high (float): Upper bound for the importance sampling ratio.
             loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+            importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits.
         """
         super().__init__()
@@ -230,6 +262,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         self.epsilon_high = epsilon_high
         self.loss_type = loss_type
         self.max_completion_length = max_completion_length
+        self.importance_sampling_level = importance_sampling_level
         self.temperature = temperature
 
     def forward(
@@ -263,6 +296,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             self.epsilon_high,
             self.loss_type,
             self.max_completion_length,
+            self.importance_sampling_level,
             self.temperature,
             self.compiled,
             self.use_ref_model,
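With the flag threaded through LigerFusedLinearGRPOLoss, switching a training loop from GRPO to GSPO-style weighting looks like a one-argument change. A minimal construction sketch using only keyword arguments that appear in this diff; all other options keep their defaults:

from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss

grpo_loss = LigerFusedLinearGRPOLoss(
    loss_type="bnpo",
    importance_sampling_level="sequence",  # "token" reproduces the original GRPO behaviour
    temperature=1.0,
)
# forward() then takes the usual fused-linear inputs (hidden states, lm_head weight,
# selected token ids, attention mask, advantages, optional reference-model inputs);
# see test/chunked_loss/test_grpo_loss.py for the exact call signature.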
@@ -36,6 +36,7 @@ if TYPE_CHECKING:
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma3_text  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v  # noqa: F401
+    from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v_moe  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_granite  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama4  # noqa: F401
@@ -95,6 +96,7 @@ def __getattr__(name: str):
         "apply_liger_kernel_to_gemma3_text",
         "apply_liger_kernel_to_glm4",
         "apply_liger_kernel_to_glm4v",
+        "apply_liger_kernel_to_glm4v_moe",
         "apply_liger_kernel_to_granite",
         "apply_liger_kernel_to_llama",
         "apply_liger_kernel_to_llava",
@@ -159,6 +161,7 @@ if _TRANSFORMERS_AVAILABLE:
         "apply_liger_kernel_to_gemma3_text",
         "apply_liger_kernel_to_glm4",
         "apply_liger_kernel_to_glm4v",
+        "apply_liger_kernel_to_glm4v_moe",
         "apply_liger_kernel_to_granite",
         "apply_liger_kernel_to_llama",
         "apply_liger_kernel_to_llava",
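The new apply_liger_kernel_to_glm4v_moe export is registered alongside the other apply_* helpers, so the expected usage is to patch before the model is instantiated. A sketch under that assumption; its keyword options, if any, are not shown in this diff and are assumed to mirror the other helpers:

from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe

# Patch the GLM-4V-MoE modules with Liger kernels, then load the model as usual.
apply_liger_kernel_to_glm4v_moe()

# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained("org/glm4v-moe-checkpoint")  # hypothetical checkpoint id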