liger-kernel-nightly 0.6.0.dev20250719041120__tar.gz → 0.6.0.dev20250719041256__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/data/all_benchmark_data.csv +41 -31
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/layer_norm.py +126 -88
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_layer_norm.py +3 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/benchmark.yml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/.gitignore +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/Makefile +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/README.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_softmax.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/dev/modal/benchmarks.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/index.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/docs/license.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/setup.cfg +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/setup.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/gemma3.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/glm4.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/llama4.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/llava.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/olmo2.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/paligemma.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen3.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/model/smollm3.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/softmax.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/bf16/test_mini_models.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/bf16/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/fp32/test_mini_models.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/convergence/fp32/test_mini_models_with_logits.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_softmax.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/triton/test_triton_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/test/utils.py +0 -0
@@ -625,36 +625,6 @@ group_norm,huggingface,backward,memory,MB,C,num_channels,256,320.5078125,320.507
|
|
625
625
|
group_norm,huggingface,backward,memory,MB,C,num_channels,512,641.015625,641.015625,641.015625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
|
626
626
|
group_norm,huggingface,backward,memory,MB,C,num_channels,1024,1282.03125,1282.03125,1282.03125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
|
627
627
|
group_norm,huggingface,backward,memory,MB,C,num_channels,2048,2564.0625,2564.0625,2564.0625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
|
628
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.035840000957250595,0.03481600061058998,0.035840000957250595,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
629
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.05939200147986412,0.058368001133203506,0.060416001826524734,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
630
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.10751999914646149,0.10751999914646149,0.1085439994931221,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
631
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.20582400262355804,0.20479999482631683,0.20684799551963806,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
632
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.3993600010871887,0.3983359932899475,0.40140798687934875,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
633
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.03788800165057182,0.03788800165057182,0.03891199827194214,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
634
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.0655359998345375,0.0655359998345375,0.06656000018119812,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
635
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.14745600521564484,0.14643199741840363,0.14847999811172485,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
636
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.31334400177001953,0.3123199939727783,0.31436800956726074,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
637
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.6133760213851929,0.6123520135879517,0.6154239773750305,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
638
|
-
layer_norm,liger,full,speed,ms,N,hidden size,1024,0.6860799789428711,0.6146048903465271,0.7049216032028198,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
639
|
-
layer_norm,liger,full,speed,ms,N,hidden size,2048,0.6789119839668274,0.6737920045852661,0.6912000179290771,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
640
|
-
layer_norm,liger,full,speed,ms,N,hidden size,4096,0.6686720252037048,0.6635519862174988,0.681984007358551,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
641
|
-
layer_norm,liger,full,speed,ms,N,hidden size,8192,0.6789119839668274,0.5908480286598206,0.6932479739189148,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
642
|
-
layer_norm,liger,full,speed,ms,N,hidden size,16384,6.071296215057373,5.331148624420166,6.08235502243042,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
643
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.13312000036239624,0.13209599256515503,0.13312000036239624,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
644
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.23244799673557281,0.2303999960422516,0.23347200453281403,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
645
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.5242879986763,0.5232639908790588,0.5263360142707825,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
646
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,8192,1.0168319940567017,1.0147839784622192,1.018880009651184,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
647
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,16384,1.994752049446106,1.9916800260543823,1.9967999458312988,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
648
|
-
layer_norm,liger,full,memory,MB,N,hidden size,1024,80.90625,80.90625,80.90625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
649
|
-
layer_norm,liger,full,memory,MB,N,hidden size,2048,161.78125,161.78125,161.78125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
650
|
-
layer_norm,liger,full,memory,MB,N,hidden size,4096,323.53125,323.53125,323.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
651
|
-
layer_norm,liger,full,memory,MB,N,hidden size,8192,647.03125,647.03125,647.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
652
|
-
layer_norm,liger,full,memory,MB,N,hidden size,16384,1294.03125,1294.03125,1294.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
653
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
654
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
655
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
656
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
657
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
658
628
|
fused_linear_orpo_loss,liger,forward,speed,ms,B,B,2,116.00621032714844,116.00621032714844,116.00621032714844,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
|
659
629
|
fused_linear_orpo_loss,liger,forward,speed,ms,B,B,4,230.83609008789062,230.83609008789062,230.83609008789062,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
|
660
630
|
fused_linear_orpo_loss,liger,forward,speed,ms,B,B,8,461.9543151855469,461.9543151855469,461.9543151855469,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
|
@@ -1493,6 +1463,46 @@ distill_cosine_loss,torch,full,memory,MB,BT,B x T,1024,7566.2822265625,7566.2822
|
|
1493
1463
|
distill_cosine_loss,torch,full,memory,MB,BT,B x T,2048,11590.3134765625,11590.3134765625,11590.3134765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
|
1494
1464
|
distill_cosine_loss,torch,full,memory,MB,BT,B x T,4096,19654.375,19654.375,19654.375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
|
1495
1465
|
distill_cosine_loss,torch,full,memory,MB,BT,B x T,8192,35782.5,35782.5,35782.5,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
|
1466
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.018848000094294548,0.018400000408291817,0.020102400332689285,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
1467
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.029152000322937965,0.02876799926161766,0.029823999851942062,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
1468
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.05104000121355057,0.05036799982190132,0.05177599936723709,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
1469
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.0947519987821579,0.09436800330877304,0.09507200121879578,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
1470
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.18476800620555878,0.18396799266338348,0.1852159947156906,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
1471
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.023584000766277313,0.023423999547958374,0.023840000852942467,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
1472
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.03734400123357773,0.03702399879693985,0.037811201065778746,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
1473
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.06617599725723267,0.06560000032186508,0.06678400188684464,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
1474
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.15267199277877808,0.15190400183200836,0.15347200632095337,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
1475
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.3067840039730072,0.3046143889427185,0.3081152021884918,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
1476
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,1024,0.12006399780511856,0.11653760075569153,0.12467200309038162,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
1477
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,2048,0.1207360029220581,0.1176128014922142,0.1256511986255646,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
1478
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,4096,0.16630400717258453,0.16412800550460815,0.16838400065898895,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
1479
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,8192,0.31279999017715454,0.31116798520088196,0.3145279884338379,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
1480
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,16384,0.5776320099830627,0.5753471970558167,0.5798912048339844,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
1481
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,1024,0.0605119988322258,0.059647999703884125,0.061344001442193985,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
1482
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,2048,0.09967999905347824,0.09849599748849869,0.10099200159311295,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
1483
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,4096,0.17881600558757782,0.17795200645923615,0.17971199750900269,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
1484
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,8192,0.33369600772857666,0.3328000009059906,0.33478400111198425,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
1485
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,16384,0.6424000263214111,0.6412223815917969,0.643455982208252,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
1486
|
+
layer_norm,liger,full,speed,ms,N,hidden size,1024,0.26576000452041626,0.2629248082637787,0.2701759934425354,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
1487
|
+
layer_norm,liger,full,speed,ms,N,hidden size,2048,0.27427199482917786,0.26999040842056277,0.28091518878936766,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
1488
|
+
layer_norm,liger,full,speed,ms,N,hidden size,4096,0.27454400062561035,0.27004799246788025,0.2807359993457794,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
1489
|
+
layer_norm,liger,full,speed,ms,N,hidden size,8192,0.40556800365448,0.40403199195861816,0.40723198652267456,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
1490
|
+
layer_norm,liger,full,speed,ms,N,hidden size,16384,0.7608960270881653,0.7589311957359314,0.7631679773330688,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
1491
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.08025600016117096,0.07942400127649307,0.08111999928951263,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1492
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.13315199315547943,0.13180799782276154,0.13468800485134125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1493
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.2417600005865097,0.24089600145816803,0.24262399971485138,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1494
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,8192,0.4832639992237091,0.48214399814605713,0.4843647956848145,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1495
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,16384,0.950575977563858,0.9484800100326538,0.9528064012527466,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1496
|
+
layer_norm,liger,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1497
|
+
layer_norm,liger,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1498
|
+
layer_norm,liger,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1499
|
+
layer_norm,liger,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1500
|
+
layer_norm,liger,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1501
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1502
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1503
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1504
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1505
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
1496
1506
|
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,1024,0.01759999990463257,0.017311999574303627,0.017920000478625298,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
1497
1507
|
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,2048,0.02924799919128418,0.028863999992609024,0.029983999207615852,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
1498
1508
|
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,4096,0.05129599943757057,0.050624001771211624,0.05209600180387497,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
@@ -1564,4 +1574,4 @@ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,2048,208.06298828
|
|
1564
1574
|
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578125,416.11767578125,416.11767578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1565
1575
|
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1566
1576
|
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1567
|
-
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
1577
|
+
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "liger_kernel_nightly"
|
7
|
-
version = "0.6.0.dev20250719041120"
|
7
|
+
version = "0.6.0.dev20250719041256"
|
8
8
|
description = "Efficient Triton kernels for LLM Training"
|
9
9
|
urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
@@ -1,4 +1,3 @@
|
|
1
|
-
import math
|
2
1
|
import operator
|
3
2
|
|
4
3
|
import torch
|
@@ -43,30 +42,45 @@ def _layer_norm_forward_kernel(
|
|
43
42
|
https://arxiv.org/abs/1607.06450
|
44
43
|
https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
|
45
44
|
"""
|
46
|
-
row_idx = tl.program_id(0)
|
45
|
+
row_idx = tl.program_id(0).to(tl.int64)
|
47
46
|
col_offsets = tl.arange(0, BLOCK_SIZE)
|
48
47
|
mask = col_offsets < n_cols
|
49
48
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
49
|
+
# Pre-load weights and bias in fp32 to avoid repeated conversions
|
50
|
+
W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
|
51
|
+
B_row = tl.load(B_ptr + col_offsets, mask=mask, other=0.0)
|
52
|
+
W_f32 = W_row.to(tl.float32)
|
53
|
+
B_f32 = B_row.to(tl.float32)
|
54
|
+
|
55
|
+
# Calculate pointers for this row
|
56
|
+
row_X_ptr = X_ptr + row_idx * X_row_stride
|
57
|
+
row_Y_ptr = Y_ptr + row_idx * Y_row_stride
|
58
|
+
row_Mean_ptr = Mean_ptr + row_idx * Mean_row_stride
|
59
|
+
row_RSTD_ptr = RSTD_ptr + row_idx * RSTD_row_stride
|
60
|
+
|
61
|
+
# Load input data and convert to fp32 for numerical stability
|
62
|
+
X_row = tl.load(row_X_ptr + col_offsets, mask=mask, other=0.0)
|
63
|
+
X_f32 = X_row.to(tl.float32)
|
64
|
+
|
65
|
+
# Compute statistics in fp32 for numerical stability
|
66
|
+
n_cols_f32 = n_cols.to(tl.float32)
|
67
|
+
mean = tl.sum(X_f32, axis=0) / n_cols_f32
|
68
|
+
X_centered = X_f32 - mean
|
69
|
+
# Apply mask to variance calculation to exclude contributions from masked elements
|
70
|
+
X_centered_masked = tl.where(mask, X_centered, 0.0)
|
71
|
+
var = tl.sum(X_centered_masked * X_centered_masked, axis=0) / n_cols_f32
|
62
72
|
rstd = rsqrt(var + eps)
|
63
73
|
|
64
|
-
|
65
|
-
tl.store(
|
74
|
+
# Store statistics (convert back to original dtype only once)
|
75
|
+
tl.store(row_Mean_ptr, mean.to(X_row.dtype))
|
76
|
+
tl.store(row_RSTD_ptr, rstd.to(X_row.dtype))
|
66
77
|
|
67
|
-
|
78
|
+
# Fused normalization and affine transformation
|
79
|
+
# Y = (X - mean) * rstd * W + B = X_centered * rstd * W + B
|
80
|
+
Y_f32 = X_centered * rstd * W_f32 + B_f32
|
68
81
|
|
69
|
-
|
82
|
+
# Store output (single conversion back to original dtype)
|
83
|
+
tl.store(row_Y_ptr + col_offsets, Y_f32.to(X_row.dtype), mask=mask)
|
70
84
|
|
71
85
|
|
72
86
|
@triton.jit
|
@@ -81,73 +95,87 @@ def _layer_norm_backward_kernel(
|
|
81
95
|
DY_ptr, # pointer to output grad, shape (n_rows, n_cols)
|
82
96
|
stride_x, # stride of each row in input
|
83
97
|
stride_dx, # stride of each row in input grad
|
84
|
-
stride_dw, # stride of each row in weights grad
|
85
|
-
stride_db, # stride of each row in bias grad
|
86
98
|
stride_dy, # stride of each row in output grad
|
87
|
-
n_rows,
|
88
99
|
n_cols,
|
89
|
-
rows_per_program: tl.constexpr,
|
90
100
|
BLOCK_SIZE: tl.constexpr,
|
91
101
|
dtype: tl.constexpr,
|
102
|
+
atomic_dtype: tl.constexpr,
|
92
103
|
):
|
93
104
|
"""
|
94
105
|
References:
|
95
106
|
https://arxiv.org/abs/1607.06450
|
96
107
|
https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
|
97
|
-
https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
|
98
|
-
https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/triton/layer_norm.py
|
99
108
|
"""
|
100
|
-
|
101
|
-
row_start = row_block_id * rows_per_program
|
102
|
-
row_end = min((row_block_id + 1) * rows_per_program, n_rows)
|
109
|
+
row_idx = tl.program_id(0).to(tl.int64)
|
103
110
|
cols = tl.arange(0, BLOCK_SIZE)
|
104
111
|
mask = cols < n_cols
|
105
112
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
tl.store(
|
139
|
-
|
113
|
+
# Pre-load weights once (same optimization as forward pass)
|
114
|
+
w = tl.load(W_ptr + cols, mask=mask, other=0.0)
|
115
|
+
w_f32 = w.to(tl.float32)
|
116
|
+
n_cols_f32 = n_cols.to(tl.float32)
|
117
|
+
|
118
|
+
# Calculate pointers for this specific row
|
119
|
+
row_X_ptr = X_ptr + row_idx * stride_x
|
120
|
+
row_DX_ptr = DX_ptr + row_idx * stride_dx
|
121
|
+
row_DY_ptr = DY_ptr + row_idx * stride_dy
|
122
|
+
row_Mean_ptr = Mean_ptr + row_idx
|
123
|
+
row_RSTD_ptr = RSTD_ptr + row_idx
|
124
|
+
|
125
|
+
# Load data for this row
|
126
|
+
x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
|
127
|
+
dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
|
128
|
+
mean = tl.load(row_Mean_ptr)
|
129
|
+
rstd = tl.load(row_RSTD_ptr)
|
130
|
+
|
131
|
+
# Convert to fp32 for numerical stability
|
132
|
+
x_f32 = x.to(tl.float32)
|
133
|
+
dy_f32 = dy.to(tl.float32)
|
134
|
+
mean_f32 = mean.to(tl.float32)
|
135
|
+
rstd_f32 = rstd.to(tl.float32)
|
136
|
+
|
137
|
+
# Compute backward pass for this row
|
138
|
+
x_hat = (x_f32 - mean_f32) * rstd_f32
|
139
|
+
wdy = w_f32 * dy_f32
|
140
|
+
c1 = tl.sum(x_hat * wdy, axis=0) / n_cols_f32
|
141
|
+
c2 = tl.sum(wdy, axis=0) / n_cols_f32
|
142
|
+
dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
|
143
|
+
|
144
|
+
# Store input gradient
|
145
|
+
tl.store(row_DX_ptr + cols, dx.to(dtype), mask=mask)
|
146
|
+
|
147
|
+
# Accumulate weight and bias gradients using atomic operations
|
148
|
+
dw = dy_f32 * x_hat
|
149
|
+
db = dy_f32
|
150
|
+
tl.atomic_add(DW_ptr + cols, dw.to(atomic_dtype), mask=mask)
|
151
|
+
tl.atomic_add(DB_ptr + cols, db.to(atomic_dtype), mask=mask)
|
140
152
|
|
141
153
|
|
142
154
|
def layer_norm_forward(X, W, B, eps):
|
155
|
+
"""
|
156
|
+
Args:
|
157
|
+
X: Input tensor of shape (..., hidden_size)
|
158
|
+
W: Weight tensor of shape (hidden_size,)
|
159
|
+
B: Bias tensor of shape (hidden_size,)
|
160
|
+
eps: Small constant for numerical stability
|
161
|
+
|
162
|
+
Returns:
|
163
|
+
Tuple of (output, input, mean, rstd, block_size, num_warps)
|
164
|
+
"""
|
143
165
|
shape = X.shape
|
144
166
|
dim = shape[-1]
|
145
167
|
X = X.view(-1, dim)
|
146
168
|
n_rows, n_cols = X.shape
|
169
|
+
|
170
|
+
# Calculate optimal block size and warp configuration
|
147
171
|
BLOCK_SIZE, num_warps = calculate_settings(n_cols)
|
172
|
+
|
173
|
+
# Allocate output tensors
|
148
174
|
Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
|
149
175
|
Mean = torch.empty(n_rows, dtype=X.dtype, device=X.device)
|
150
176
|
RSTD = torch.empty(n_rows, dtype=X.dtype, device=X.device)
|
177
|
+
|
178
|
+
# Validate input dimensions
|
151
179
|
if X.shape[1] != W.shape[0]:
|
152
180
|
raise ValueError(
|
153
181
|
f"Incompatible dimensions: input feature size (X.shape[1]={X.shape[1]}) "
|
@@ -159,7 +187,9 @@ def layer_norm_forward(X, W, B, eps):
|
|
159
187
|
if X.device.type == "xpu":
|
160
188
|
kernel_args["grf_mode"] = "large"
|
161
189
|
|
162
|
-
|
190
|
+
# Launch kernel with one thread block per row for optimal performance
|
191
|
+
grid = (n_rows,)
|
192
|
+
_layer_norm_forward_kernel[grid](
|
163
193
|
Y,
|
164
194
|
Y.stride(0),
|
165
195
|
X,
|
@@ -176,35 +206,43 @@ def layer_norm_forward(X, W, B, eps):
|
|
176
206
|
eps,
|
177
207
|
BLOCK_SIZE=BLOCK_SIZE,
|
178
208
|
num_warps=num_warps,
|
179
|
-
**kernel_args,
|
209
|
+
**kernel_args,
|
180
210
|
)
|
211
|
+
|
181
212
|
return Y.view(*shape), X, Mean, RSTD, BLOCK_SIZE, num_warps
|
182
213
|
|
183
214
|
|
184
215
|
def layer_norm_backward(dY, X, W, B, Mean, RSTD):
|
216
|
+
"""
|
217
|
+
Args:
|
218
|
+
dY: Gradient of output
|
219
|
+
X: Input tensor
|
220
|
+
W: Weight tensor
|
221
|
+
B: Bias tensor
|
222
|
+
Mean: Pre-computed mean
|
223
|
+
RSTD: Pre-computed reciprocal standard deviation
|
224
|
+
|
225
|
+
Returns:
|
226
|
+
Tuple of (input_grad, weight_grad, bias_grad)
|
227
|
+
"""
|
185
228
|
shape = dY.shape
|
186
229
|
dim = shape[-1]
|
187
230
|
dY = dY.view(-1, dim)
|
188
231
|
n_rows, n_cols = dY.shape
|
189
232
|
|
190
|
-
|
191
|
-
if X.device.type == "cuda":
|
192
|
-
sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
|
193
|
-
elif X.device.type == "xpu":
|
194
|
-
sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
|
195
|
-
|
233
|
+
# Allocate gradient tensors
|
196
234
|
DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
|
197
|
-
|
198
|
-
|
235
|
+
# Use float32 buffers for weight/bias gradients when W is bfloat16 (due to atomic_add limitation)
|
236
|
+
grad_dtype = torch.float32 if W.dtype == torch.bfloat16 else W.dtype
|
237
|
+
DW = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
|
238
|
+
DB = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
|
199
239
|
|
240
|
+
# Calculate optimal block size and warp configuration
|
200
241
|
BLOCK_SIZE, num_warps = calculate_settings(n_cols)
|
201
242
|
if n_cols > BLOCK_SIZE:
|
202
|
-
raise RuntimeError(
|
203
|
-
f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
|
204
|
-
)
|
243
|
+
raise RuntimeError(f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}.")
|
205
244
|
|
206
|
-
|
207
|
-
grid = (sm_count,)
|
245
|
+
# Determine dtype for triton operations
|
208
246
|
triton_dtype = (
|
209
247
|
tl.float32
|
210
248
|
if X.dtype == torch.float32
|
@@ -212,41 +250,41 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
|
|
212
250
|
if X.dtype == torch.bfloat16
|
213
251
|
else tl.float16
|
214
252
|
if X.dtype == torch.float16
|
215
|
-
else tl.float32 # fallback
|
253
|
+
else tl.float32 # fallback
|
216
254
|
)
|
217
255
|
|
256
|
+
# Accumulate atomics in float32 whenever the input dtype is bfloat16 (atomic_add limitation); otherwise use the input dtype
|
257
|
+
atomic_dtype = tl.float32 if triton_dtype == tl.bfloat16 else triton_dtype
|
258
|
+
|
218
259
|
# XPU-specific optimization
|
219
260
|
kernel_args = {}
|
220
261
|
if X.device.type == "xpu":
|
221
262
|
kernel_args.update({"grf_mode": "large", "num_warps": 32, "num_stages": 4})
|
222
263
|
|
264
|
+
# Launch kernel with one thread block per row for optimal performance
|
265
|
+
grid = (n_rows,)
|
223
266
|
_layer_norm_backward_kernel[grid](
|
224
267
|
X,
|
225
268
|
W,
|
226
269
|
Mean,
|
227
270
|
RSTD,
|
228
271
|
DX,
|
229
|
-
|
230
|
-
|
272
|
+
DW,
|
273
|
+
DB,
|
231
274
|
dY,
|
232
275
|
X.stride(0),
|
233
276
|
DX.stride(0),
|
234
|
-
_DW.stride(0),
|
235
|
-
_DB.stride(0),
|
236
277
|
dY.stride(0),
|
237
|
-
n_rows,
|
238
278
|
n_cols,
|
239
|
-
rows_per_program,
|
240
279
|
BLOCK_SIZE=BLOCK_SIZE,
|
241
280
|
dtype=triton_dtype,
|
242
|
-
|
281
|
+
atomic_dtype=atomic_dtype,
|
282
|
+
num_warps=num_warps,
|
283
|
+
**kernel_args,
|
243
284
|
)
|
244
285
|
|
245
|
-
DW = _DW.sum(dim=0).to(W.dtype)
|
246
|
-
DB = _DB.sum(dim=0).to(W.dtype)
|
247
|
-
|
248
286
|
DX = DX.view(*shape)
|
249
|
-
return DX, DW, DB
|
287
|
+
return DX, DW.to(W.dtype), DB.to(W.dtype)
|
250
288
|
|
251
289
|
|
252
290
|
class LigerLayerNormFunction(torch.autograd.Function):
|
@@ -16,12 +16,14 @@ device = infer_device()
|
|
16
16
|
(4, 16, 128),
|
17
17
|
(1, 1, 1023), # Minimal batch/seq with near power-of-2 hidden
|
18
18
|
(3, 7, 256), # Prime numbers for batch/seq
|
19
|
+
(1, 1, 1500),
|
19
20
|
],
|
20
21
|
)
|
21
22
|
@pytest.mark.parametrize(
|
22
23
|
"dtype, atol, rtol",
|
23
24
|
[
|
24
25
|
(torch.float32, 1e-5, 1e-5),
|
26
|
+
(torch.bfloat16, 2e-2, 2e-2), # Relaxed tolerance for bfloat16 due to lower precision + atomic limitations
|
25
27
|
],
|
26
28
|
)
|
27
29
|
def test_liger_layer_norm(
|
@@ -72,6 +74,7 @@ def test_liger_layer_norm(
|
|
72
74
|
"dtype, atol, rtol",
|
73
75
|
[
|
74
76
|
(torch.float32, 1e-5, 1e-5),
|
77
|
+
(torch.bfloat16, 2e-2, 2e-2), # Relaxed tolerance for bfloat16 due to lower precision + atomic limitations
|
75
78
|
],
|
76
79
|
)
|
77
80
|
def test_liger_layer_norm_functional(
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{liger_kernel_nightly-0.6.0.dev20250719041120 → liger_kernel_nightly-0.6.0.dev20250719041256}/NOTICE
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|