liger-kernel 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel-0.6.1/.github/workflows/benchmark.yml +167 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/PKG-INFO +11 -13
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/README.md +10 -12
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/data/all_benchmark_data.csv +112 -30
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_cpo_loss.py +14 -8
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_dpo_loss.py +14 -16
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_embedding.py +8 -0
- liger_kernel-0.6.1/benchmark/scripts/benchmark_fused_add_rms_norm.py +201 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_orpo_loss.py +14 -8
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_simpo_loss.py +14 -8
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/dev/modal/benchmarks.py +1 -1
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/index.md +8 -10
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/pyproject.toml +1 -1
- liger_kernel-0.6.1/src/liger_kernel/ops/fused_add_rms_norm.py +412 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/layer_norm.py +126 -89
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/rms_norm.py +2 -2
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/rope.py +1 -1
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/__init__.py +5 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/functional.py +5 -0
- liger_kernel-0.6.1/src/liger_kernel/transformers/fused_add_rms_norm.py +39 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/gemma3.py +1 -1
- liger_kernel-0.6.1/src/liger_kernel/transformers/model/smollm3.py +189 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/monkey_patch.py +85 -12
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/PKG-INFO +11 -13
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/SOURCES.txt +5 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/bf16/test_mini_models.py +64 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/bf16/test_mini_models_with_logits.py +63 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/fp32/test_mini_models.py +61 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/fp32/test_mini_models_with_logits.py +61 -0
- liger_kernel-0.6.1/test/transformers/test_fused_add_rms_norm.py +219 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_layer_norm.py +3 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_monkey_patch.py +52 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/utils.py +12 -0
- liger_kernel-0.6.0/.github/workflows/benchmark.yml +0 -93
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/pull_request_template.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/docs.yml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/.gitignore +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/LICENSE +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/Makefile +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/NOTICE +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/README.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_softmax.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/benchmark/scripts/utils.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/dev/fmt-requirements.txt +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/dev/modal/tests.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/Examples.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/Getting-Started.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/High-Level-APIs.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/acknowledgement.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/contributing.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/banner.GIF +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/compose.gif +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/e2e-memory.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/e2e-tps.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/logo-banner.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/patch.gif +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/images/post-training.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/docs/license.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/README.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/callback.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/training.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/lightning/README.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/lightning/requirements.txt +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/lightning/training.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/README.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/callback.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/requirements.txt +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/examples/medusa/train.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/mkdocs.yml +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/setup.cfg +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/setup.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/glm4.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/llama4.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/llava.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/loss_utils.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/olmo2.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/paligemma.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen2_5_vl.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen3.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/model/qwen3_moe.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/softmax.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel/utils.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/dependency_links.txt +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/requires.txt +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/src/liger_kernel.egg-info/top_level.txt +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_cosine_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/conftest.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_dyt.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_embedding.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_fused_neighborhood_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_geglu.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_jsd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_rope.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_softmax.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_transformers.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/transformers/test_tvd.py +0 -0
- {liger_kernel-0.6.0 → liger_kernel-0.6.1}/test/triton/test_triton_monkey_patch.py +0 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
name: Benchmarks
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
schedule:
|
|
7
|
+
# Runs at 00:00 UTC every Friday
|
|
8
|
+
- cron: '0 0 * * 5'
|
|
9
|
+
workflow_dispatch: # Enables manual trigger
|
|
10
|
+
inputs:
|
|
11
|
+
commit_hash:
|
|
12
|
+
description: 'Commit hash to benchmark'
|
|
13
|
+
default: 'main'
|
|
14
|
+
overwrite:
|
|
15
|
+
description: 'Overwrite existing benchmark data if it exists'
|
|
16
|
+
type: boolean
|
|
17
|
+
default: false
|
|
18
|
+
|
|
19
|
+
permissions:
|
|
20
|
+
contents: write
|
|
21
|
+
|
|
22
|
+
concurrency:
|
|
23
|
+
# This cancels any previous in-progress run of this workflow on the same PR / branch.
|
|
24
|
+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
|
25
|
+
cancel-in-progress: true
|
|
26
|
+
|
|
27
|
+
jobs:
|
|
28
|
+
benchmarks:
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
env:
|
|
31
|
+
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
|
|
32
|
+
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
|
|
33
|
+
GITHUB_USERNAME: linkedin
|
|
34
|
+
REPO_NAME: Liger-Kernel
|
|
35
|
+
OUTPUT_DIR: benchmarks
|
|
36
|
+
OUTPUT_FILENAME: benchmark.csv
|
|
37
|
+
GENERATED_CSV: benchmark/data/all_benchmark_data.csv
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
steps:
|
|
41
|
+
# Step: Decide the commit hash to use
|
|
42
|
+
- name: Determine commit hash to checkout
|
|
43
|
+
id: choose_commit
|
|
44
|
+
run: |
|
|
45
|
+
if [ "${{ github.event.inputs.commit_hash }}" != "" ]; then
|
|
46
|
+
echo "Using manual input commit: ${{ github.event.inputs.commit_hash }}"
|
|
47
|
+
echo "hash=${{ github.event.inputs.commit_hash }}" >> $GITHUB_OUTPUT
|
|
48
|
+
else
|
|
49
|
+
echo "Using latest commit from main"
|
|
50
|
+
git fetch origin main
|
|
51
|
+
echo "hash=$(git rev-parse origin/main)" >> $GITHUB_OUTPUT
|
|
52
|
+
fi
|
|
53
|
+
|
|
54
|
+
# Step: Checkout full history so we can check out any commit
|
|
55
|
+
- name: Checkout full repo history
|
|
56
|
+
uses: actions/checkout@v3
|
|
57
|
+
with:
|
|
58
|
+
fetch-depth: 0 # Important: so we can checkout arbitrary commit
|
|
59
|
+
# Step: Conditionally replace benchmark folder from main
|
|
60
|
+
- name: Replace benchmark folder from main (manual only, commit ≠ main)
|
|
61
|
+
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.commit_hash != 'main' }}
|
|
62
|
+
run: |
|
|
63
|
+
echo "Detected manual trigger with commit_hash = ${{ github.event.inputs.commit_hash }}"
|
|
64
|
+
|
|
65
|
+
# Save current branch (detached HEAD at old commit)
|
|
66
|
+
ORIG_COMMIT=${{ github.event.inputs.commit_hash }}
|
|
67
|
+
|
|
68
|
+
# Fetch and checkout main
|
|
69
|
+
git fetch origin main
|
|
70
|
+
git checkout origin/main -- benchmark/
|
|
71
|
+
|
|
72
|
+
# Save benchmark folder from main
|
|
73
|
+
cp -r benchmark /tmp/benchmark_main
|
|
74
|
+
# Checkout back to target commit
|
|
75
|
+
git checkout $ORIG_COMMIT
|
|
76
|
+
# Replace old benchmark with one from main
|
|
77
|
+
rm -rf benchmark
|
|
78
|
+
cp -r /tmp/benchmark_main benchmark
|
|
79
|
+
|
|
80
|
+
# Step: Check if benchmark exists and exit if overwrite is false
|
|
81
|
+
- name: Check existing benchmark
|
|
82
|
+
run: |
|
|
83
|
+
COMMIT_HASH="${{ steps.choose_commit.outputs.hash }}"
|
|
84
|
+
BENCHMARK_URL="https://raw.githubusercontent.com/linkedin/Liger-Kernel/refs/heads/gh-pages/benchmarks/${COMMIT_HASH}/benchmark.csv"
|
|
85
|
+
|
|
86
|
+
if curl --output /dev/null --silent --head --fail "$BENCHMARK_URL"; then
|
|
87
|
+
echo "Benchmark already exists for commit $COMMIT_HASH"
|
|
88
|
+
if [ "${{ github.event.inputs.overwrite }}" != "true" ]; then
|
|
89
|
+
echo "Overwrite is false - exiting"
|
|
90
|
+
exit 1
|
|
91
|
+
else
|
|
92
|
+
echo "Overwrite is true - proceeding"
|
|
93
|
+
fi
|
|
94
|
+
else
|
|
95
|
+
echo "No existing benchmark found - proceeding"
|
|
96
|
+
fi
|
|
97
|
+
|
|
98
|
+
- name: Set up Python
|
|
99
|
+
uses: actions/setup-python@v3
|
|
100
|
+
with:
|
|
101
|
+
python-version: '3.10'
|
|
102
|
+
|
|
103
|
+
# Install dependencies
|
|
104
|
+
- name: Install dependencies
|
|
105
|
+
run: |
|
|
106
|
+
python -m pip install --upgrade pip
|
|
107
|
+
pip install modal
|
|
108
|
+
|
|
109
|
+
# Delete previous benchmark results.
|
|
110
|
+
- name: Remove previous benchmark data
|
|
111
|
+
run: |
|
|
112
|
+
rm -f benchmark/data/all_benchmark_data.csv
|
|
113
|
+
|
|
114
|
+
- name: Run benchmarks on GPU
|
|
115
|
+
run: |
|
|
116
|
+
modal run dev.modal.benchmarks
|
|
117
|
+
|
|
118
|
+
# Step 5: Checkout gh-pages branch in a subfolder
|
|
119
|
+
- name: Checkout gh-pages
|
|
120
|
+
uses: actions/checkout@v3
|
|
121
|
+
with:
|
|
122
|
+
ref: gh-pages
|
|
123
|
+
path: gh-pages
|
|
124
|
+
|
|
125
|
+
# Step 6: Copy benchmark CSV to gh-pages directory
|
|
126
|
+
- name: Copy generated benchmark to gh-pages
|
|
127
|
+
id: copy_benchmark
|
|
128
|
+
run: |
|
|
129
|
+
if [[ "${{ github.event_name }}" == "release" ]]; then
|
|
130
|
+
echo "Release event detected"
|
|
131
|
+
path=${{steps.choose_commit.outputs.hash}}-${{ github.event.release.tag_name }}
|
|
132
|
+
else
|
|
133
|
+
echo "Not a release event"
|
|
134
|
+
path=${{steps.choose_commit.outputs.hash}}
|
|
135
|
+
fi
|
|
136
|
+
COMMIT_DIR="gh-pages/${OUTPUT_DIR}/${path}"
|
|
137
|
+
|
|
138
|
+
mkdir -p "$COMMIT_DIR"
|
|
139
|
+
|
|
140
|
+
if [ -f "$COMMIT_DIR/${OUTPUT_FILENAME}" ]; then
|
|
141
|
+
echo "Removing existing benchmark.csv for this commit"
|
|
142
|
+
rm "$COMMIT_DIR/${OUTPUT_FILENAME}"
|
|
143
|
+
fi
|
|
144
|
+
|
|
145
|
+
cp "${GENERATED_CSV}" "$COMMIT_DIR/${OUTPUT_FILENAME}"
|
|
146
|
+
|
|
147
|
+
# Step 7: Append commit hash to commits.txt if not already present
|
|
148
|
+
- name: Update commits.txt
|
|
149
|
+
run: |
|
|
150
|
+
cd gh-pages
|
|
151
|
+
echo "commits.txt file path: ${OUTPUT_DIR}/commits.txt"
|
|
152
|
+
# Create file if it doesn't exist
|
|
153
|
+
mkdir -p ${OUTPUT_DIR}
|
|
154
|
+
touch ${OUTPUT_DIR}/commits.txt
|
|
155
|
+
|
|
156
|
+
echo "${{ steps.copy_benchmark.outputs.path }}" >> ${OUTPUT_DIR}/commits.txt
|
|
157
|
+
|
|
158
|
+
echo "Added commit hash to commits.txt"
|
|
159
|
+
# Step 8: Commit and push
|
|
160
|
+
- name: Commit and push to gh-pages
|
|
161
|
+
run: |
|
|
162
|
+
cd gh-pages
|
|
163
|
+
git config user.name github-actions[bot]
|
|
164
|
+
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
|
|
165
|
+
git add .
|
|
166
|
+
git commit -m "Add benchmark for commit ${{ steps.copy_benchmark.outputs.path }}" || echo "No changes to commit"
|
|
167
|
+
git push origin gh-pages
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: liger_kernel
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: Efficient Triton kernels for LLM Training
|
|
5
5
|
License: BSD 2-CLAUSE LICENSE
|
|
6
6
|
Copyright 2024 LinkedIn Corporation
|
|
@@ -84,7 +84,7 @@ Dynamic: requires-dist
|
|
|
84
84
|
</td>
|
|
85
85
|
<td style="padding: 10px;">
|
|
86
86
|
<a href="https://discord.gg/gpumode">
|
|
87
|
-
<img src="https://dcbadge.
|
|
87
|
+
<img src="https://dcbadge.limes.pink/api/server/gpumode?style=flat" alt="Join Our Discord">
|
|
88
88
|
</a>
|
|
89
89
|
</td>
|
|
90
90
|
</tr>
|
|
@@ -307,7 +307,7 @@ loss.backward()
|
|
|
307
307
|
| Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
308
308
|
| Qwen2.5-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_5_vl` | RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
309
309
|
| Qwen3 | `liger_kernel.transformers.apply_liger_kernel_to_qwen3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
310
|
-
| Qwen3 MoE | `
|
|
310
|
+
| Qwen3 MoE | `liger_kernel.transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
311
311
|
| Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
312
312
|
| Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
|
|
313
313
|
| OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
@@ -414,21 +414,19 @@ loss.backward()
|
|
|
414
414
|
|
|
415
415
|
- For issues, create a GitHub issue in this repository
|
|
416
416
|
- For open discussion, join [our discord channel on GPUMode](https://discord.com/channels/1189498204333543425/1275130785933951039)
|
|
417
|
-
- For formal collaboration, send an email to yannchen@linkedin.com and
|
|
417
|
+
- For formal collaboration, send an email to Yanning Chen (yannchen@linkedin.com) and Zhipeng Wang (zhipwang@linkedin.com)
|
|
418
418
|
|
|
419
419
|
## Cite this work
|
|
420
420
|
|
|
421
421
|
Biblatex entry:
|
|
422
422
|
```bib
|
|
423
|
-
@
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
url={https://arxiv.org/abs/2410.10989},
|
|
431
|
-
journal={arXiv preprint arXiv:2410.10989},
|
|
423
|
+
@inproceedings{
|
|
424
|
+
hsu2025ligerkernel,
|
|
425
|
+
title={Liger-Kernel: Efficient Triton Kernels for {LLM} Training},
|
|
426
|
+
author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen and Zhipeng Wang},
|
|
427
|
+
booktitle={Championing Open-source DEvelopment in ML Workshop @ ICML25},
|
|
428
|
+
year={2025},
|
|
429
|
+
url={https://openreview.net/forum?id=36SjAIT42G}
|
|
432
430
|
}
|
|
433
431
|
```
|
|
434
432
|
|
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
</td>
|
|
33
33
|
<td style="padding: 10px;">
|
|
34
34
|
<a href="https://discord.gg/gpumode">
|
|
35
|
-
<img src="https://dcbadge.
|
|
35
|
+
<img src="https://dcbadge.limes.pink/api/server/gpumode?style=flat" alt="Join Our Discord">
|
|
36
36
|
</a>
|
|
37
37
|
</td>
|
|
38
38
|
</tr>
|
|
@@ -255,7 +255,7 @@ loss.backward()
|
|
|
255
255
|
| Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
256
256
|
| Qwen2.5-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_5_vl` | RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
257
257
|
| Qwen3 | `liger_kernel.transformers.apply_liger_kernel_to_qwen3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
258
|
-
| Qwen3 MoE | `
|
|
258
|
+
| Qwen3 MoE | `liger_kernel.transformers.apply_liger_kernel_to_qwen3_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
259
259
|
| Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
260
260
|
| Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
|
|
261
261
|
| OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
|
|
@@ -362,21 +362,19 @@ loss.backward()
|
|
|
362
362
|
|
|
363
363
|
- For issues, create a GitHub issue in this repository
|
|
364
364
|
- For open discussion, join [our discord channel on GPUMode](https://discord.com/channels/1189498204333543425/1275130785933951039)
|
|
365
|
-
- For formal collaboration, send an email to yannchen@linkedin.com and
|
|
365
|
+
- For formal collaboration, send an email to Yanning Chen (yannchen@linkedin.com) and Zhipeng Wang (zhipwang@linkedin.com)
|
|
366
366
|
|
|
367
367
|
## Cite this work
|
|
368
368
|
|
|
369
369
|
Biblatex entry:
|
|
370
370
|
```bib
|
|
371
|
-
@
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
url={https://arxiv.org/abs/2410.10989},
|
|
379
|
-
journal={arXiv preprint arXiv:2410.10989},
|
|
371
|
+
@inproceedings{
|
|
372
|
+
hsu2025ligerkernel,
|
|
373
|
+
title={Liger-Kernel: Efficient Triton Kernels for {LLM} Training},
|
|
374
|
+
author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen and Zhipeng Wang},
|
|
375
|
+
booktitle={Championing Open-source DEvelopment in ML Workshop @ ICML25},
|
|
376
|
+
year={2025},
|
|
377
|
+
url={https://openreview.net/forum?id=36SjAIT42G}
|
|
380
378
|
}
|
|
381
379
|
```
|
|
382
380
|
|
|
@@ -625,36 +625,6 @@ group_norm,huggingface,backward,memory,MB,C,num_channels,256,320.5078125,320.507
|
|
|
625
625
|
group_norm,huggingface,backward,memory,MB,C,num_channels,512,641.015625,641.015625,641.015625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
|
|
626
626
|
group_norm,huggingface,backward,memory,MB,C,num_channels,1024,1282.03125,1282.03125,1282.03125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
|
|
627
627
|
group_norm,huggingface,backward,memory,MB,C,num_channels,2048,2564.0625,2564.0625,2564.0625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:20:53,0.3.1
|
|
628
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.035840000957250595,0.03481600061058998,0.035840000957250595,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
|
629
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.05939200147986412,0.058368001133203506,0.060416001826524734,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
|
630
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.10751999914646149,0.10751999914646149,0.1085439994931221,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
|
631
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.20582400262355804,0.20479999482631683,0.20684799551963806,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
|
632
|
-
layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.3993600010871887,0.3983359932899475,0.40140798687934875,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:51,0.3.1
|
|
633
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.03788800165057182,0.03788800165057182,0.03891199827194214,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
|
634
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.0655359998345375,0.0655359998345375,0.06656000018119812,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
|
635
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.14745600521564484,0.14643199741840363,0.14847999811172485,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
|
636
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.31334400177001953,0.3123199939727783,0.31436800956726074,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
|
637
|
-
layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.6133760213851929,0.6123520135879517,0.6154239773750305,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:27:53,0.3.1
|
|
638
|
-
layer_norm,liger,full,speed,ms,N,hidden size,1024,0.6860799789428711,0.6146048903465271,0.7049216032028198,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
|
639
|
-
layer_norm,liger,full,speed,ms,N,hidden size,2048,0.6789119839668274,0.6737920045852661,0.6912000179290771,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
|
640
|
-
layer_norm,liger,full,speed,ms,N,hidden size,4096,0.6686720252037048,0.6635519862174988,0.681984007358551,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
|
641
|
-
layer_norm,liger,full,speed,ms,N,hidden size,8192,0.6789119839668274,0.5908480286598206,0.6932479739189148,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
|
642
|
-
layer_norm,liger,full,speed,ms,N,hidden size,16384,6.071296215057373,5.331148624420166,6.08235502243042,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:02,0.3.1
|
|
643
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.13312000036239624,0.13209599256515503,0.13312000036239624,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
644
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.23244799673557281,0.2303999960422516,0.23347200453281403,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
645
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.5242879986763,0.5232639908790588,0.5263360142707825,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
646
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,8192,1.0168319940567017,1.0147839784622192,1.018880009651184,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
647
|
-
layer_norm,huggingface,full,speed,ms,N,hidden size,16384,1.994752049446106,1.9916800260543823,1.9967999458312988,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
648
|
-
layer_norm,liger,full,memory,MB,N,hidden size,1024,80.90625,80.90625,80.90625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
649
|
-
layer_norm,liger,full,memory,MB,N,hidden size,2048,161.78125,161.78125,161.78125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
650
|
-
layer_norm,liger,full,memory,MB,N,hidden size,4096,323.53125,323.53125,323.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
651
|
-
layer_norm,liger,full,memory,MB,N,hidden size,8192,647.03125,647.03125,647.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
652
|
-
layer_norm,liger,full,memory,MB,N,hidden size,16384,1294.03125,1294.03125,1294.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:04,0.3.1
|
|
653
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
|
654
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
|
655
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
|
656
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
|
657
|
-
layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-40GB,2024-11-05 19:28:05,0.3.1
|
|
658
628
|
fused_linear_orpo_loss,liger,forward,speed,ms,B,B,2,116.00621032714844,116.00621032714844,116.00621032714844,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
|
|
659
629
|
fused_linear_orpo_loss,liger,forward,speed,ms,B,B,4,230.83609008789062,230.83609008789062,230.83609008789062,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
|
|
660
630
|
fused_linear_orpo_loss,liger,forward,speed,ms,B,B,8,461.9543151855469,461.9543151855469,461.9543151855469,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0
|
|
@@ -1493,3 +1463,115 @@ distill_cosine_loss,torch,full,memory,MB,BT,B x T,1024,7566.2822265625,7566.2822
|
|
|
1493
1463
|
distill_cosine_loss,torch,full,memory,MB,BT,B x T,2048,11590.3134765625,11590.3134765625,11590.3134765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
|
|
1494
1464
|
distill_cosine_loss,torch,full,memory,MB,BT,B x T,4096,19654.375,19654.375,19654.375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
|
|
1495
1465
|
distill_cosine_loss,torch,full,memory,MB,BT,B x T,8192,35782.5,35782.5,35782.5,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
|
|
1466
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.018848000094294548,0.018400000408291817,0.020102400332689285,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
|
1467
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.029152000322937965,0.02876799926161766,0.029823999851942062,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
|
1468
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.05104000121355057,0.05036799982190132,0.05177599936723709,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
|
1469
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.0947519987821579,0.09436800330877304,0.09507200121879578,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
|
1470
|
+
layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.18476800620555878,0.18396799266338348,0.1852159947156906,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0
|
|
1471
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.023584000766277313,0.023423999547958374,0.023840000852942467,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
|
1472
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.03734400123357773,0.03702399879693985,0.037811201065778746,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
|
1473
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.06617599725723267,0.06560000032186508,0.06678400188684464,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
|
1474
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.15267199277877808,0.15190400183200836,0.15347200632095337,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
|
1475
|
+
layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.3067840039730072,0.3046143889427185,0.3081152021884918,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0
|
|
1476
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,1024,0.12006399780511856,0.11653760075569153,0.12467200309038162,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
|
1477
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,2048,0.1207360029220581,0.1176128014922142,0.1256511986255646,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
|
1478
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,4096,0.16630400717258453,0.16412800550460815,0.16838400065898895,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
|
1479
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,8192,0.31279999017715454,0.31116798520088196,0.3145279884338379,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
|
1480
|
+
layer_norm,liger,backward,speed,ms,N,hidden size,16384,0.5776320099830627,0.5753471970558167,0.5798912048339844,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0
|
|
1481
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,1024,0.0605119988322258,0.059647999703884125,0.061344001442193985,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
|
1482
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,2048,0.09967999905347824,0.09849599748849869,0.10099200159311295,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
|
1483
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,4096,0.17881600558757782,0.17795200645923615,0.17971199750900269,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
|
1484
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,8192,0.33369600772857666,0.3328000009059906,0.33478400111198425,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
|
1485
|
+
layer_norm,huggingface,backward,speed,ms,N,hidden size,16384,0.6424000263214111,0.6412223815917969,0.643455982208252,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0
|
|
1486
|
+
layer_norm,liger,full,speed,ms,N,hidden size,1024,0.26576000452041626,0.2629248082637787,0.2701759934425354,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
|
1487
|
+
layer_norm,liger,full,speed,ms,N,hidden size,2048,0.27427199482917786,0.26999040842056277,0.28091518878936766,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
|
1488
|
+
layer_norm,liger,full,speed,ms,N,hidden size,4096,0.27454400062561035,0.27004799246788025,0.2807359993457794,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
|
1489
|
+
layer_norm,liger,full,speed,ms,N,hidden size,8192,0.40556800365448,0.40403199195861816,0.40723198652267456,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
|
1490
|
+
layer_norm,liger,full,speed,ms,N,hidden size,16384,0.7608960270881653,0.7589311957359314,0.7631679773330688,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0
|
|
1491
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.08025600016117096,0.07942400127649307,0.08111999928951263,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1492
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.13315199315547943,0.13180799782276154,0.13468800485134125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1493
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.2417600005865097,0.24089600145816803,0.24262399971485138,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1494
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,8192,0.4832639992237091,0.48214399814605713,0.4843647956848145,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1495
|
+
layer_norm,huggingface,full,speed,ms,N,hidden size,16384,0.950575977563858,0.9484800100326538,0.9528064012527466,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1496
|
+
layer_norm,liger,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1497
|
+
layer_norm,liger,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1498
|
+
layer_norm,liger,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1499
|
+
layer_norm,liger,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1500
|
+
layer_norm,liger,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1501
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1502
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1503
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1504
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1505
|
+
layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0
|
|
1506
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,1024,0.01759999990463257,0.017311999574303627,0.017920000478625298,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
|
1507
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,2048,0.02924799919128418,0.028863999992609024,0.029983999207615852,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
|
1508
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,4096,0.05129599943757057,0.050624001771211624,0.05209600180387497,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
|
1509
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,8192,0.09344000369310379,0.09296000003814697,0.09382399916648865,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
|
1510
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,16384,0.1791680008172989,0.17814399302005768,0.1796800047159195,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
|
1511
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,32768,0.43830400705337524,0.43744000792503357,0.43929600715637207,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0
|
|
1512
|
+
fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,1024,0.060095999389886856,0.059808000922203064,0.06054399907588959,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
|
|
1513
|
+
fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,2048,0.09084799885749817,0.09027200192213058,0.09161599725484848,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
|
|
1514
|
+
fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,4096,0.17820799350738525,0.17744000256061554,0.17897599935531616,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
|
|
1515
|
+
fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,8192,0.312608003616333,0.3118720054626465,0.31324800848960876,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
|
|
1516
|
+
fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,16384,0.574944019317627,0.5740479826927185,0.5756288051605225,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
|
|
1517
|
+
fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,32768,1.0943039655685425,1.0934272289276123,1.0951999425888062,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0
|
|
1518
|
+
fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,1024,0.0352960005402565,0.03481600061058998,0.03811199963092804,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
|
|
1519
|
+
fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,2048,0.05430399999022484,0.05392000079154968,0.05503999814391136,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
|
|
1520
|
+
fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,4096,0.10592000186443329,0.1054655984044075,0.10630399733781815,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
|
|
1521
|
+
fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,8192,0.19679999351501465,0.19631999731063843,0.19724799692630768,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
|
|
1522
|
+
fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,16384,0.37436801195144653,0.3733760118484497,0.3752320110797882,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
|
|
1523
|
+
fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,32768,0.7376000285148621,0.7361343741416931,0.7391359806060791,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0
|
|
1524
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,1024,0.3147200047969818,0.30796160697937014,0.32764801383018494,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
|
|
1525
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,2048,0.3089919984340668,0.30374398827552795,0.3226880133152008,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
|
|
1526
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,4096,0.30691200494766235,0.3023296058177948,0.3205504059791565,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
|
|
1527
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,8192,0.3246079981327057,0.3185984075069428,0.33656961321830753,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
|
|
1528
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,16384,0.6010559797286987,0.5996800065040588,0.6026239991188049,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
|
|
1529
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,32768,1.8402559757232666,1.8322880268096924,1.8461120128631592,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0
|
|
1530
|
+
fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,1024,0.23878400027751923,0.23545600473880768,0.2507520020008087,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
|
|
1531
|
+
fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,2048,0.34513600170612335,0.34377598762512207,0.34678399562835693,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
|
|
1532
|
+
fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,4096,0.6330879926681519,0.631712019443512,0.6345599889755249,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
|
|
1533
|
+
fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,8192,1.1185599565505981,1.1172800064086914,1.1196800470352173,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
|
|
1534
|
+
fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,16384,2.0697600841522217,2.0678528785705566,2.0713536739349365,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
|
|
1535
|
+
fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,32768,3.9561920166015625,3.953824043273926,3.9581120014190674,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0
|
|
1536
|
+
fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,1024,0.38916800916194916,0.3824320137500763,0.4037184059619903,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
|
|
1537
|
+
fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,2048,0.3890720009803772,0.38193280100822447,0.4032831907272339,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
|
|
1538
|
+
fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,4096,0.39715200662612915,0.3928639888763428,0.41097599267959595,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
|
|
1539
|
+
fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,8192,0.6275200247764587,0.6259520053863525,0.6287999749183655,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
|
|
1540
|
+
fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,16384,1.202239990234375,1.199679970741272,1.2048959732055664,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
|
|
1541
|
+
fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,32768,2.7738559246063232,2.7705343723297116,2.777868890762329,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0
|
|
1542
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,1024,0.15619200468063354,0.15376000106334686,0.1661248028278351,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
|
|
1543
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,2048,0.15825600177049637,0.15600000321865082,0.16911999881267548,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
|
|
1544
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,4096,0.16700799763202667,0.16502399742603302,0.1709440052509308,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
|
|
1545
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,8192,0.1712000072002411,0.1700800061225891,0.17215999960899353,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
|
|
1546
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,16384,0.42505601048469543,0.4233280122280121,0.42691200971603394,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
|
|
1547
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,32768,1.4057759642601013,1.3944000005722046,1.4099839925765991,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0
|
|
1548
|
+
fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,1024,0.1520960032939911,0.15136000514030457,0.1528960019350052,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
|
|
1549
|
+
fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,2048,0.2533760070800781,0.2524160146713257,0.25436800718307495,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
|
|
1550
|
+
fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,4096,0.4551039934158325,0.4540799856185913,0.45612800121307373,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
|
|
1551
|
+
fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,8192,0.8053439855575562,0.8038079738616943,0.806656002998352,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
|
|
1552
|
+
fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,16384,1.4933120012283325,1.492095947265625,1.49452805519104,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
|
|
1553
|
+
fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,32768,2.8600640296936035,2.8583295822143557,2.8612607955932616,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0
|
|
1554
|
+
fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,1024,0.20175999402999878,0.199072003364563,0.2154303938150406,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1555
|
+
fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,2048,0.20263999700546265,0.20000000298023224,0.21675519943237304,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1556
|
+
fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,4096,0.25276800990104675,0.2515519857406616,0.2539199888706207,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1557
|
+
fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,8192,0.4322720021009445,0.43088001012802124,0.4336000084877014,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1558
|
+
fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,16384,0.8288000226020813,0.8266303777694701,0.8311295866966247,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1559
|
+
fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,32768,2.03987193107605,2.0360767364501955,2.0436416149139403,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1560
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,1024,72.546875,72.546875,72.546875,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1561
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,2048,145.0859375,145.0859375,145.0859375,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1562
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,4096,290.1640625,290.1640625,290.1640625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1563
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,8192,580.3203125,580.3203125,580.3203125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1564
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,16384,1160.6328125,1160.6328125,1160.6328125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1565
|
+
fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,32768,2321.2578125,2321.2578125,2321.2578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1566
|
+
fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,1024,104.03173828125,104.03173828125,104.03173828125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1567
|
+
fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,2048,208.05517578125,208.05517578125,208.05517578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1568
|
+
fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,4096,416.10205078125,416.10205078125,416.10205078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1569
|
+
fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,8192,832.19580078125,832.19580078125,832.19580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1570
|
+
fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,16384,1664.3125,1664.3125,1664.3125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1571
|
+
fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,32768,3328.625,3328.625,3328.625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1572
|
+
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,1024,104.03564453125,104.03564453125,104.03564453125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1573
|
+
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,2048,208.06298828125,208.06298828125,208.06298828125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1574
|
+
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578125,416.11767578125,416.11767578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1575
|
+
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1576
|
+
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
1577
|
+
fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
|
|
@@ -36,17 +36,20 @@ def bench_memory_fused_linear_cpo_loss(
|
|
|
36
36
|
dtype = input.extra_benchmark_config["dtype"]
|
|
37
37
|
provider = input.kernel_provider
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
# Instantiate once and retrieve the first output only
|
|
40
|
+
torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
|
|
41
|
+
liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
|
|
42
|
+
torch_fwd = lambda x, target: torch_lm_head_cpo(x, target)[0]
|
|
43
|
+
liger_fwd = lambda x, target: liger_lm_head_cpo(x, target)[0]
|
|
41
44
|
|
|
42
45
|
_input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
|
|
43
46
|
target = torch.randint(V, (B, T), dtype=torch.long, device=device)
|
|
44
47
|
|
|
45
48
|
def fwd():
|
|
46
49
|
if provider == "liger":
|
|
47
|
-
return
|
|
50
|
+
return liger_fwd(_input, target)
|
|
48
51
|
elif provider == "huggingface":
|
|
49
|
-
return
|
|
52
|
+
return torch_fwd(_input, target)
|
|
50
53
|
|
|
51
54
|
def full():
|
|
52
55
|
y = fwd()
|
|
@@ -79,17 +82,20 @@ def bench_speed_fused_linear_cpo_loss(
|
|
|
79
82
|
provider = input.kernel_provider
|
|
80
83
|
mode = input.kernel_operation_mode
|
|
81
84
|
|
|
82
|
-
|
|
83
|
-
|
|
85
|
+
# Instantiate once and retrieve the first output only
|
|
86
|
+
torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
|
|
87
|
+
liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device)
|
|
88
|
+
torch_fwd = lambda x, target: torch_lm_head_cpo(x, target)[0]
|
|
89
|
+
liger_fwd = lambda x, target: liger_lm_head_cpo(x, target)[0]
|
|
84
90
|
|
|
85
91
|
_input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
|
|
86
92
|
target = torch.randint(V, (B, T), dtype=torch.long, device=device)
|
|
87
93
|
|
|
88
94
|
def fwd():
|
|
89
95
|
if provider == "liger":
|
|
90
|
-
return
|
|
96
|
+
return liger_fwd(_input, target)
|
|
91
97
|
elif provider == "huggingface":
|
|
92
|
-
return
|
|
98
|
+
return torch_fwd(_input, target)
|
|
93
99
|
|
|
94
100
|
if mode == "forward":
|
|
95
101
|
ms_50, ms_20, ms_80 = triton.testing.do_bench(
|
|
@@ -32,12 +32,11 @@ def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
|
|
|
32
32
|
ignore_index = input.extra_benchmark_config["ignore_index"]
|
|
33
33
|
provider = input.kernel_provider
|
|
34
34
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
).to(device)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
).to(device)(x, ref_x, target)[0]
|
|
35
|
+
# Instantiate once and retrieve the first output only
|
|
36
|
+
torch_dpo_loss = TorchLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
|
|
37
|
+
liger_dpo_loss = LigerLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
|
|
38
|
+
torch_fwd = lambda x, ref_x, target: torch_dpo_loss(x, ref_x, target)[0]
|
|
39
|
+
liger_fwd = lambda x, ref_x, target: liger_dpo_loss(x, ref_x, target)[0]
|
|
41
40
|
|
|
42
41
|
# Input shape: [B, T, H]
|
|
43
42
|
_input = torch.randn(B, T, H, device=device, dtype=dtype)
|
|
@@ -52,9 +51,9 @@ def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
|
|
|
52
51
|
|
|
53
52
|
def fwd():
|
|
54
53
|
if provider == "liger":
|
|
55
|
-
return
|
|
54
|
+
return liger_fwd(_input, ref_input, target)
|
|
56
55
|
elif provider == "huggingface":
|
|
57
|
-
return
|
|
56
|
+
return torch_fwd(_input, ref_input, target)
|
|
58
57
|
|
|
59
58
|
def full():
|
|
60
59
|
y = fwd()
|
|
@@ -83,12 +82,11 @@ def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
|
|
|
83
82
|
provider = input.kernel_provider
|
|
84
83
|
mode = input.kernel_operation_mode
|
|
85
84
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
).to(device)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
).to(device)(x, ref_x, target)[0]
|
|
85
|
+
# Instantiate once and retrieve the first output only
|
|
86
|
+
torch_dpo_loss = TorchLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
|
|
87
|
+
liger_dpo_loss = LigerLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
|
|
88
|
+
torch_fwd = lambda x, ref_x, target: torch_dpo_loss(x, ref_x, target)[0]
|
|
89
|
+
liger_fwd = lambda x, ref_x, target: liger_dpo_loss(x, ref_x, target)[0]
|
|
92
90
|
|
|
93
91
|
# Input shape: [B, T, H]
|
|
94
92
|
_input = torch.randn(B, T, H, device=device, dtype=dtype)
|
|
@@ -103,9 +101,9 @@ def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
|
|
|
103
101
|
|
|
104
102
|
def fwd():
|
|
105
103
|
if provider == "liger":
|
|
106
|
-
return
|
|
104
|
+
return liger_fwd(_input, ref_input, target)
|
|
107
105
|
elif provider == "huggingface":
|
|
108
|
-
return
|
|
106
|
+
return torch_fwd(_input, ref_input, target)
|
|
109
107
|
|
|
110
108
|
if mode == "forward":
|
|
111
109
|
ms_50, ms_20, ms_80 = triton.testing.do_bench(
|
|
@@ -48,6 +48,14 @@ def bench_speed_embedding(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
|
|
|
48
48
|
|
|
49
49
|
if mode == "forward":
|
|
50
50
|
ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, rep=100)
|
|
51
|
+
elif mode == "backward":
|
|
52
|
+
output = fwd()
|
|
53
|
+
ms_50, ms_20, ms_80 = triton.testing.do_bench(
|
|
54
|
+
lambda: output.backward(torch.randn_like(output), retain_graph=True),
|
|
55
|
+
quantiles=QUANTILES,
|
|
56
|
+
grad_to_none=[input_ids],
|
|
57
|
+
rep=100,
|
|
58
|
+
)
|
|
51
59
|
elif mode == "full":
|
|
52
60
|
ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, rep=100)
|
|
53
61
|
return SingleBenchmarkRunOutput(
|