liger-kernel-nightly 0.6.2.dev20251013144132__tar.gz → 0.6.3.dev20251118154655__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of liger-kernel-nightly might be problematic. Click here for more details.
- liger_kernel_nightly-0.6.3.dev20251118154655/.github/workflows/docs.yml +64 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/data/all_benchmark_data.csv +240 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/benchmark/scripts/benchmark_poly_norm.py +197 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/benchmark/scripts/benchmark_tiled_mlp.py +397 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/acknowledgement.md +0 -1
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/jsd_loss.py +18 -5
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/cross_entropy.py +63 -10
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/fused_linear_cross_entropy.py +43 -13
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/ops/poly_norm.py +386 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/ops/tiled_mlp.py +136 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/__init__.py +18 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/cross_entropy.py +8 -3
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/functional.py +29 -6
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +8 -3
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/falcon_h1.py +19 -5
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/gemma.py +17 -6
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/gemma2.py +14 -5
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/gemma3.py +25 -12
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/glm4.py +16 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/glm4v.py +16 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/glm4v_moe.py +23 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/internvl.py +12 -5
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/llama.py +14 -5
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/llama4.py +16 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/llava.py +12 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/loss_utils.py +31 -3
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/mistral.py +15 -6
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/mixtral.py +16 -7
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/mllama.py +12 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/olmo2.py +16 -4
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/transformers/model/output_classes.py +147 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/paligemma.py +22 -5
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/phi3.py +14 -7
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/qwen2.py +16 -3
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/qwen2_5_vl.py +14 -6
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/qwen2_vl.py +16 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/qwen3.py +18 -5
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/qwen3_moe.py +19 -5
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/transformers/model/qwen3_next.py +146 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/transformers/model/qwen3_vl.py +150 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/smollm3.py +15 -6
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/transformers/model/smolvlm.py +158 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/monkey_patch.py +401 -17
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/transformers/poly_norm.py +42 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/rms_norm.py +7 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/transformers/rope.py +63 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/src/liger_kernel/transformers/tiled_mlp.py +133 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel_nightly.egg-info/SOURCES.txt +15 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/test/conftest.py +11 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/bf16/test_mini_models.py +259 -8
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/bf16/test_mini_models_multimodal.py +314 -1
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/bf16/test_mini_models_with_logits.py +256 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/fp32/test_mini_models.py +253 -4
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/fp32/test_mini_models_multimodal.py +452 -1
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/fp32/test_mini_models_with_logits.py +270 -1
- liger_kernel_nightly-0.6.3.dev20251118154655/test/resources/fake_configs/HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json +1192 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/test/resources/fake_configs/Qwen/Qwen3-VL-4B-Instruct/tokenizer_config.json +63 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_cross_entropy.py +81 -6
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_fused_linear_cross_entropy.py +229 -5
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_monkey_patch.py +795 -13
- liger_kernel_nightly-0.6.3.dev20251118154655/test/transformers/test_poly_norm.py +281 -0
- liger_kernel_nightly-0.6.3.dev20251118154655/test/transformers/test_tiled_mlp.py +216 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/utils.py +67 -0
- liger_kernel_nightly-0.6.2.dev20251013144132/.github/workflows/docs.yml +0 -33
- liger_kernel_nightly-0.6.2.dev20251013144132/src/liger_kernel/transformers/rope.py +0 -20
- liger_kernel_nightly-0.6.2.dev20251013144132/test/conftest.py +0 -8
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/workflows/benchmark.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/.gitignore +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/Makefile +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/benchmark_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/dev/modal/benchmarks.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/dev/modal/tests.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/index.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/docs/license.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/huggingface/training_multimodal.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/setup.cfg +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/setup.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/fsdp.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/llama4_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/transformers/tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/test_cosine_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_dyt.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_flex_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_fused_add_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_fused_neighborhood_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_multi_token_attention.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_softmax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_sparsemax.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/transformers/test_tvd.py +0 -0
- {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251118154655}/test/triton/test_triton_monkey_patch.py +0 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
name: Publish documentation
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches:
|
|
5
|
+
- main
|
|
6
|
+
paths:
|
|
7
|
+
- 'docs/**'
|
|
8
|
+
- 'mkdocs.yml'
|
|
9
|
+
|
|
10
|
+
permissions:
|
|
11
|
+
contents: write
|
|
12
|
+
jobs:
|
|
13
|
+
deploy:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- name: Configure Git Credentials
|
|
18
|
+
run: |
|
|
19
|
+
git config user.name github-actions[bot]
|
|
20
|
+
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
|
|
21
|
+
- uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: 3.x
|
|
24
|
+
- run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
|
|
25
|
+
- uses: actions/cache@v4
|
|
26
|
+
with:
|
|
27
|
+
key: mkdocs-material-${{ env.cache_id }}
|
|
28
|
+
path: .cache
|
|
29
|
+
restore-keys: |
|
|
30
|
+
mkdocs-material-
|
|
31
|
+
- run: pip install mkdocs-material mkdocstrings[python]
|
|
32
|
+
# ====== Backup the benchmarks from gh-pages ======
|
|
33
|
+
# This is necessary because the benchmarks are not included in the documentation build process.
|
|
34
|
+
# So we need to backup the benchmarks from gh-pages and restore them after the documentation is built.
|
|
35
|
+
- name: Backup benchmarks from gh-pages
|
|
36
|
+
run: |
|
|
37
|
+
git fetch origin gh-pages
|
|
38
|
+
# create worktree bound to local gh-pages, tracking origin/gh-pages
|
|
39
|
+
git branch -f gh-pages origin/gh-pages || true
|
|
40
|
+
mkdir -p ghp && git worktree add ghp gh-pages || true
|
|
41
|
+
if [ -d ghp/benchmarks ]; then
|
|
42
|
+
tar -C ghp -czf /tmp/benchmarks.tgz benchmarks
|
|
43
|
+
fi
|
|
44
|
+
# IMPORTANT: remove worktree so gh-pages isn't checked out anywhere
|
|
45
|
+
git worktree remove ghp --force || true
|
|
46
|
+
echo "Backed up benchmarks from gh-pages"
|
|
47
|
+
# ====== Deploy the documentation ======
|
|
48
|
+
- name: Deploy documentation
|
|
49
|
+
run: mkdocs gh-deploy --force
|
|
50
|
+
# ====== Restore the benchmarks onto gh-pages ======
|
|
51
|
+
# This is necessary because the benchmarks are not included in the documentation build process.
|
|
52
|
+
# So we need to restore the benchmarks onto gh-pages after the documentation is built.
|
|
53
|
+
- name: Restore benchmarks onto gh-pages
|
|
54
|
+
run: |
|
|
55
|
+
# Refresh remote tracking and recreate a clean worktree
|
|
56
|
+
git fetch origin gh-pages
|
|
57
|
+
git worktree add -B gh-pages ghp origin/gh-pages
|
|
58
|
+
if [ -f /tmp/benchmarks.tgz ]; then
|
|
59
|
+
tar -C ghp -xzf /tmp/benchmarks.tgz
|
|
60
|
+
git -C ghp add -A
|
|
61
|
+
git -C ghp commit -m "Restore benchmarks after gh-deploy" || echo "No changes"
|
|
62
|
+
git -C ghp push origin gh-pages
|
|
63
|
+
fi
|
|
64
|
+
git worktree remove ghp --force || true
|
|
@@ -1703,3 +1703,243 @@ llama4_rope,huggingface,full,memory,MB,T,sequence length,2048,314.01611328125,31
|
|
|
1703
1703
|
llama4_rope,huggingface,full,memory,MB,T,sequence length,4096,596.03173828125,596.03173828125,596.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
|
1704
1704
|
llama4_rope,huggingface,full,memory,MB,T,sequence length,8192,1160.06298828125,1160.06298828125,1160.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
|
1705
1705
|
llama4_rope,huggingface,full,memory,MB,T,sequence length,16384,2288.12548828125,2288.12548828125,2288.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
|
|
1706
|
+
tiled_geglu,liger,full,speed,ms,T,sequence length,1024,2.1678080558776855,2.166579246520996,2.1682305335998535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
|
|
1707
|
+
tiled_geglu,liger,full,speed,ms,T,sequence length,2048,4.344256401062012,4.343987464904785,4.34452486038208,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
|
|
1708
|
+
tiled_geglu,liger,full,speed,ms,T,sequence length,4096,8.653023719787598,8.653023719787598,8.653023719787598,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
|
|
1709
|
+
tiled_geglu,liger,full,speed,ms,T,sequence length,8192,16.909311294555664,16.909311294555664,16.909311294555664,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
|
|
1710
|
+
tiled_geglu,liger,full,speed,ms,T,sequence length,16384,33.63123321533203,33.63123321533203,33.63123321533203,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
|
|
1711
|
+
tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,1024,3.353935956954956,3.353523015975952,3.35434889793396,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
|
|
1712
|
+
tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,2048,6.023168087005615,6.023168087005615,6.023168087005615,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
|
|
1713
|
+
tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,4096,11.495424270629883,11.495424270629883,11.495424270629883,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
|
|
1714
|
+
tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,8192,23.68614387512207,23.68614387512207,23.68614387512207,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
|
|
1715
|
+
tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,16384,47.478782653808594,47.478782653808594,47.478782653808594,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
|
|
1716
|
+
tiled_geglu,liger,forward,speed,ms,T,sequence length,1024,0.6614400148391724,0.6594560146331787,0.6635519862174988,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1717
|
+
tiled_geglu,liger,forward,speed,ms,T,sequence length,2048,1.3471999168395996,1.346560001373291,1.3475840091705322,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1718
|
+
tiled_geglu,liger,forward,speed,ms,T,sequence length,4096,2.752511978149414,2.7261502742767334,2.7844607830047607,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1719
|
+
tiled_geglu,liger,forward,speed,ms,T,sequence length,8192,5.433343887329102,5.433343887329102,5.433343887329102,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1720
|
+
tiled_geglu,liger,forward,speed,ms,T,sequence length,16384,10.712063789367676,10.712063789367676,10.712063789367676,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1721
|
+
tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,1024,0.7403519749641418,0.7402047514915466,0.7413759827613831,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
|
|
1722
|
+
tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,2048,1.3941760063171387,1.3895679712295532,1.398144006729126,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
|
|
1723
|
+
tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,4096,2.7586560249328613,2.7585408687591553,2.759884834289551,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
|
|
1724
|
+
tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,8192,5.789696216583252,5.789696216583252,5.789696216583252,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
|
|
1725
|
+
tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,16384,11.810815811157227,11.810815811157227,11.810815811157227,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
|
|
1726
|
+
tiled_geglu,liger,backward,speed,ms,T,sequence length,1024,1.491968035697937,1.4916608333587646,1.4940160512924194,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
|
|
1727
|
+
tiled_geglu,liger,backward,speed,ms,T,sequence length,2048,3.0185279846191406,3.0131328105926514,3.0555264949798584,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
|
|
1728
|
+
tiled_geglu,liger,backward,speed,ms,T,sequence length,4096,6.021120071411133,6.021120071411133,6.021120071411133,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
|
|
1729
|
+
tiled_geglu,liger,backward,speed,ms,T,sequence length,8192,11.512767791748047,11.512767791748047,11.512767791748047,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
|
|
1730
|
+
tiled_geglu,liger,backward,speed,ms,T,sequence length,16384,22.806528091430664,22.806528091430664,22.806528091430664,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
|
|
1731
|
+
tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,1024,2.6060800552368164,2.6053311824798584,2.607308864593506,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
|
|
1732
|
+
tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,2048,4.665375709533691,4.664742469787598,4.666009426116943,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
|
|
1733
|
+
tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,4096,8.71731185913086,8.71731185913086,8.71731185913086,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
|
|
1734
|
+
tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,8192,17.99782371520996,17.99782371520996,17.99782371520996,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
|
|
1735
|
+
tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,16384,35.64400100708008,35.64400100708008,35.64400100708008,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
|
|
1736
|
+
tiled_geglu,liger,full,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1737
|
+
tiled_geglu,liger,full,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1738
|
+
tiled_geglu,liger,full,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1739
|
+
tiled_geglu,liger,full,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1740
|
+
tiled_geglu,liger,full,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1741
|
+
tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1742
|
+
tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1743
|
+
tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1744
|
+
tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1745
|
+
tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
|
|
1746
|
+
tiled_geglu,liger,forward,memory,MB,T,sequence length,1024,128.25,128.25,128.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
|
|
1747
|
+
tiled_geglu,liger,forward,memory,MB,T,sequence length,2048,192.25,192.25,192.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
|
|
1748
|
+
tiled_geglu,liger,forward,memory,MB,T,sequence length,4096,320.25,320.25,320.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
|
|
1749
|
+
tiled_geglu,liger,forward,memory,MB,T,sequence length,8192,576.25,576.25,576.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
|
|
1750
|
+
tiled_geglu,liger,forward,memory,MB,T,sequence length,16384,1088.25,1088.25,1088.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
|
|
1751
|
+
tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,1024,92.25,92.25,92.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1752
|
+
tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,2048,120.25,120.25,120.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1753
|
+
tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,4096,176.25,176.25,176.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1754
|
+
tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,8192,288.25,288.25,288.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1755
|
+
tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,16384,512.25,512.25,512.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1756
|
+
tiled_geglu,liger,backward,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1757
|
+
tiled_geglu,liger,backward,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1758
|
+
tiled_geglu,liger,backward,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1759
|
+
tiled_geglu,liger,backward,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1760
|
+
tiled_geglu,liger,backward,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1761
|
+
tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
|
|
1762
|
+
tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
|
|
1763
|
+
tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
|
|
1764
|
+
tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
|
|
1765
|
+
tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
|
|
1766
|
+
tiled_swiglu,liger,full,speed,ms,T,sequence length,1024,2.165760040283203,2.164659261703491,2.167193651199341,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
|
|
1767
|
+
tiled_swiglu,liger,full,speed,ms,T,sequence length,2048,4.371456146240234,4.368383884429932,4.374527931213379,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
|
|
1768
|
+
tiled_swiglu,liger,full,speed,ms,T,sequence length,4096,8.935423851013184,8.935423851013184,8.935423851013184,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
|
|
1769
|
+
tiled_swiglu,liger,full,speed,ms,T,sequence length,8192,17.078943252563477,17.078943252563477,17.078943252563477,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
|
|
1770
|
+
tiled_swiglu,liger,full,speed,ms,T,sequence length,16384,33.74857711791992,33.74857711791992,33.74857711791992,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
|
|
1771
|
+
tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,1024,3.3510398864746094,3.3507328033447266,3.3513472080230713,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
|
|
1772
|
+
tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,2048,6.023168087005615,6.023168087005615,6.023168087005615,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
|
|
1773
|
+
tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,4096,11.609087944030762,11.609087944030762,11.609087944030762,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
|
|
1774
|
+
tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,8192,23.8591365814209,23.8591365814209,23.8591365814209,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
|
|
1775
|
+
tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,16384,47.721473693847656,47.721473693847656,47.721473693847656,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
|
|
1776
|
+
tiled_swiglu,liger,forward,speed,ms,T,sequence length,1024,0.6594560146331787,0.6594560146331787,0.6604800224304199,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
|
|
1777
|
+
tiled_swiglu,liger,forward,speed,ms,T,sequence length,2048,1.3537280559539795,1.3527040481567383,1.3547519445419312,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
|
|
1778
|
+
tiled_swiglu,liger,forward,speed,ms,T,sequence length,4096,2.7152960300445557,2.715123176574707,2.7155072689056396,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
|
|
1779
|
+
tiled_swiglu,liger,forward,speed,ms,T,sequence length,8192,5.3361921310424805,5.3361921310424805,5.3361921310424805,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
|
|
1780
|
+
tiled_swiglu,liger,forward,speed,ms,T,sequence length,16384,10.870783805847168,10.870783805847168,10.870783805847168,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
|
|
1781
|
+
tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,1024,0.7395360469818115,0.7383040189743042,0.7413759827613831,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1782
|
+
tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,2048,1.3965599536895752,1.387935996055603,1.4024640321731567,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1783
|
+
tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,4096,2.7778561115264893,2.777395248413086,2.7780096530914307,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1784
|
+
tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,8192,5.829631805419922,5.829631805419922,5.829631805419922,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1785
|
+
tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,16384,11.841535568237305,11.841535568237305,11.841535568237305,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1786
|
+
tiled_swiglu,liger,backward,speed,ms,T,sequence length,1024,1.4970879554748535,1.4961408376693726,1.4970879554748535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
|
|
1787
|
+
tiled_swiglu,liger,backward,speed,ms,T,sequence length,2048,3.052351951599121,3.0518529415130615,3.0550782680511475,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
|
|
1788
|
+
tiled_swiglu,liger,backward,speed,ms,T,sequence length,4096,6.074687957763672,6.074687957763672,6.074687957763672,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
|
|
1789
|
+
tiled_swiglu,liger,backward,speed,ms,T,sequence length,8192,11.630592346191406,11.630592346191406,11.630592346191406,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
|
|
1790
|
+
tiled_swiglu,liger,backward,speed,ms,T,sequence length,16384,22.76793670654297,22.76793670654297,22.76793670654297,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
|
|
1791
|
+
tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,1024,2.6021440029144287,2.6000702381134033,2.6032767295837402,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
|
|
1792
|
+
tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,2048,4.641791820526123,4.641791820526123,4.641791820526123,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
|
|
1793
|
+
tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,4096,8.761343955993652,8.761343955993652,8.761343955993652,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
|
|
1794
|
+
tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,8192,17.966079711914062,17.966079711914062,17.966079711914062,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
|
|
1795
|
+
tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,16384,35.657344818115234,35.657344818115234,35.657344818115234,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
|
|
1796
|
+
tiled_swiglu,liger,full,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
|
|
1797
|
+
tiled_swiglu,liger,full,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
|
|
1798
|
+
tiled_swiglu,liger,full,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
|
|
1799
|
+
tiled_swiglu,liger,full,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
|
|
1800
|
+
tiled_swiglu,liger,full,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
|
|
1801
|
+
tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
|
|
1802
|
+
tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
|
|
1803
|
+
tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
|
|
1804
|
+
tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
|
|
1805
|
+
tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
|
|
1806
|
+
tiled_swiglu,liger,forward,memory,MB,T,sequence length,1024,128.25,128.25,128.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1807
|
+
tiled_swiglu,liger,forward,memory,MB,T,sequence length,2048,192.25,192.25,192.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1808
|
+
tiled_swiglu,liger,forward,memory,MB,T,sequence length,4096,320.25,320.25,320.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1809
|
+
tiled_swiglu,liger,forward,memory,MB,T,sequence length,8192,576.25,576.25,576.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1810
|
+
tiled_swiglu,liger,forward,memory,MB,T,sequence length,16384,1088.25,1088.25,1088.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1811
|
+
tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,1024,92.25,92.25,92.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1812
|
+
tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,2048,120.25,120.25,120.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1813
|
+
tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,4096,176.25,176.25,176.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1814
|
+
tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,8192,288.25,288.25,288.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1815
|
+
tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,16384,512.25,512.25,512.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
|
|
1816
|
+
tiled_swiglu,liger,backward,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
|
|
1817
|
+
tiled_swiglu,liger,backward,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
|
|
1818
|
+
tiled_swiglu,liger,backward,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
|
|
1819
|
+
tiled_swiglu,liger,backward,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
|
|
1820
|
+
tiled_swiglu,liger,backward,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
|
|
1821
|
+
tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
|
|
1822
|
+
tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
|
|
1823
|
+
tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
|
|
1824
|
+
tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
|
|
1825
|
+
tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
|
|
1826
|
+
tiled_geglu,huggingface,full,speed,ms,T,sequence length,1024,2.3357439041137695,2.3357439041137695,2.3375871181488037,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
|
|
1827
|
+
tiled_geglu,huggingface,full,speed,ms,T,sequence length,2048,4.764671802520752,4.764671802520752,4.764671802520752,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
|
|
1828
|
+
tiled_geglu,huggingface,full,speed,ms,T,sequence length,4096,9.4236478805542,9.4236478805542,9.4236478805542,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
|
|
1829
|
+
tiled_geglu,huggingface,full,speed,ms,T,sequence length,8192,17.628543853759766,17.628543853759766,17.628543853759766,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
|
|
1830
|
+
tiled_geglu,huggingface,full,speed,ms,T,sequence length,16384,35.06790542602539,35.06790542602539,35.06790542602539,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
|
|
1831
|
+
tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,1024,3.418976068496704,3.4176511764526367,3.4203009605407715,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
|
|
1832
|
+
tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,2048,6.158143997192383,6.158143997192383,6.158143997192383,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
|
|
1833
|
+
tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,4096,11.934720039367676,11.934720039367676,11.934720039367676,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
|
|
1834
|
+
tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,8192,24.731647491455078,24.731647491455078,24.731647491455078,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
|
|
1835
|
+
tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,16384,49.46227264404297,49.46227264404297,49.46227264404297,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
|
|
1836
|
+
tiled_geglu,huggingface,forward,speed,ms,T,sequence length,1024,0.6743040084838867,0.6736640334129333,0.677068829536438,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1837
|
+
tiled_geglu,huggingface,forward,speed,ms,T,sequence length,2048,1.418239951133728,1.418239951133728,1.421120047569275,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1838
|
+
tiled_geglu,huggingface,forward,speed,ms,T,sequence length,4096,2.88972806930542,2.889113664627075,2.8909568786621094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1839
|
+
tiled_geglu,huggingface,forward,speed,ms,T,sequence length,8192,5.701375961303711,5.701375961303711,5.701375961303711,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1840
|
+
tiled_geglu,huggingface,forward,speed,ms,T,sequence length,16384,11.276288032531738,11.276288032531738,11.276288032531738,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
|
|
1841
|
+
tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,1024,0.7433919906616211,0.7423999905586243,0.7444480061531067,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
|
|
1842
|
+
tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,2048,1.4137760400772095,1.4131200313568115,1.4152319431304932,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
|
|
1843
|
+
tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,4096,2.8241920471191406,2.823500871658325,2.8266496658325195,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
|
|
1844
|
+
tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,8192,6.087679862976074,6.087679862976074,6.087679862976074,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
|
|
1845
|
+
tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,16384,12.353535652160645,12.353535652160645,12.353535652160645,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
|
|
1846
|
+
tiled_geglu,huggingface,backward,speed,ms,T,sequence length,1024,1.5499199628829956,1.5489535331726074,1.5523840188980103,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
|
|
1847
|
+
tiled_geglu,huggingface,backward,speed,ms,T,sequence length,2048,3.171328067779541,3.169484853744507,3.173171281814575,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
|
|
1848
|
+
tiled_geglu,huggingface,backward,speed,ms,T,sequence length,4096,6.263807773590088,6.263807773590088,6.263807773590088,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
|
|
1849
|
+
tiled_geglu,huggingface,backward,speed,ms,T,sequence length,8192,12.046143531799316,12.046143531799316,12.046143531799316,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
|
|
1850
|
+
tiled_geglu,huggingface,backward,speed,ms,T,sequence length,16384,23.839744567871094,23.839744567871094,23.839744567871094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
|
|
1851
|
+
tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,1024,2.6757121086120605,2.6755776405334473,2.676710367202759,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
|
|
1852
|
+
tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,2048,4.7329277992248535,4.7329277992248535,4.7329277992248535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
|
|
1853
|
+
tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,4096,9.078783988952637,9.078783988952637,9.078783988952637,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
|
|
1854
|
+
tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,8192,18.63680076599121,18.63680076599121,18.63680076599121,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
|
|
1855
|
+
tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,16384,37.06163024902344,37.06163024902344,37.06163024902344,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
|
|
1856
|
+
tiled_geglu,huggingface,full,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
|
|
1857
|
+
tiled_geglu,huggingface,full,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
|
|
1858
|
+
tiled_geglu,huggingface,full,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
|
|
1859
|
+
tiled_geglu,huggingface,full,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
|
|
1860
|
+
tiled_geglu,huggingface,full,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
|
|
1861
|
+
tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1862
|
+
tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1863
|
+
tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1864
|
+
tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1865
|
+
tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1866
|
+
tiled_geglu,huggingface,forward,memory,MB,T,sequence length,1024,144.25,144.25,144.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1867
|
+
tiled_geglu,huggingface,forward,memory,MB,T,sequence length,2048,224.25,224.25,224.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1868
|
+
tiled_geglu,huggingface,forward,memory,MB,T,sequence length,4096,384.25,384.25,384.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1869
|
+
tiled_geglu,huggingface,forward,memory,MB,T,sequence length,8192,704.25,704.25,704.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1870
|
+
tiled_geglu,huggingface,forward,memory,MB,T,sequence length,16384,1344.25,1344.25,1344.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
|
|
1871
|
+
tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,1024,90.25,90.25,90.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1872
|
+
tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,2048,116.25,116.25,116.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1873
|
+
tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,4096,168.25,168.25,168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1874
|
+
tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,8192,272.25,272.25,272.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1875
|
+
tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,16384,480.25,480.25,480.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
|
|
1876
|
+
tiled_geglu,huggingface,backward,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1877
|
+
tiled_geglu,huggingface,backward,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1878
|
+
tiled_geglu,huggingface,backward,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1879
|
+
tiled_geglu,huggingface,backward,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1880
|
+
tiled_geglu,huggingface,backward,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
|
|
1881
|
+
tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
|
|
1882
|
+
tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
|
|
1883
|
+
tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
|
|
1884
|
+
tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
|
|
1885
|
+
tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
|
|
1886
|
+
tiled_swiglu,huggingface,full,speed,ms,T,sequence length,1024,2.2517759799957275,2.2517759799957275,2.254848003387451,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
|
|
1887
|
+
tiled_swiglu,huggingface,full,speed,ms,T,sequence length,2048,4.588511943817139,4.587302207946777,4.5897216796875,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
|
|
1888
|
+
tiled_swiglu,huggingface,full,speed,ms,T,sequence length,4096,9.233407974243164,9.233407974243164,9.233407974243164,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
|
|
1889
|
+
tiled_swiglu,huggingface,full,speed,ms,T,sequence length,8192,17.869823455810547,17.869823455810547,17.869823455810547,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
|
|
1890
|
+
tiled_swiglu,huggingface,full,speed,ms,T,sequence length,16384,35.34422302246094,35.34422302246094,35.34422302246094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
|
|
1891
|
+
tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,1024,3.4257922172546387,3.424870491027832,3.426713705062866,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
|
|
1892
|
+
tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,2048,6.155263900756836,6.155263900756836,6.155263900756836,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
|
|
1893
|
+
tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,4096,11.92959976196289,11.92959976196289,11.92959976196289,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
|
|
1894
|
+
tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,8192,24.815616607666016,24.815616607666016,24.815616607666016,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
|
|
1895
|
+
tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,16384,49.62918472290039,49.62918472290039,49.62918472290039,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
|
|
1896
|
+
tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,1024,0.6748160123825073,0.6737920045852661,0.6758400201797485,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
|
|
1897
|
+
tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,2048,1.4332799911499023,1.4325759410858154,1.4335999488830566,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
|
|
1898
|
+
tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,4096,2.91212797164917,2.904217481613159,2.9146623611450195,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
|
|
1899
|
+
tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,8192,5.658976078033447,5.658976078033447,5.658976078033447,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
|
|
1900
|
+
tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,16384,11.341952323913574,11.341952323913574,11.341952323913574,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
|
|
1901
|
+
tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,1024,0.7454720139503479,0.7429631948471069,0.7456768155097961,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1902
|
+
tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,2048,1.4120960235595703,1.410048007965088,1.4120960235595703,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1903
|
+
tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,4096,2.825216054916382,2.825216054916382,2.8264448642730713,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1904
|
+
tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,8192,6.077439785003662,6.077439785003662,6.077439785003662,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1905
|
+
tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,16384,12.356608390808105,12.356608390808105,12.356608390808105,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
|
|
1906
|
+
tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,1024,1.551360011100769,1.5511807203292847,1.5532032251358032,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
|
|
1907
|
+
tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,2048,3.1928319931030273,3.1885311603546143,3.1971328258514404,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
|
|
1908
|
+
tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,4096,6.273248195648193,6.273248195648193,6.273248195648193,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
|
|
1909
|
+
tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,8192,12.058752059936523,12.058752059936523,12.058752059936523,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
|
|
1910
|
+
tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,16384,23.853055953979492,23.853055953979492,23.853055953979492,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
|
|
1911
|
+
tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,1024,2.6746881008148193,2.6728639602661133,2.6789886951446533,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1912
|
+
tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,2048,4.739071846008301,4.739071846008301,4.739071846008301,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1913
|
+
tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,4096,9.084927558898926,9.084927558898926,9.084927558898926,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1914
|
+
tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,8192,18.729759216308594,18.729759216308594,18.729759216308594,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1915
|
+
tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,16384,37.13724899291992,37.13724899291992,37.13724899291992,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1916
|
+
tiled_swiglu,huggingface,full,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1917
|
+
tiled_swiglu,huggingface,full,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1918
|
+
tiled_swiglu,huggingface,full,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1919
|
+
tiled_swiglu,huggingface,full,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1920
|
+
tiled_swiglu,huggingface,full,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
|
|
1921
|
+
tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
|
|
1922
|
+
tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
|
|
1923
|
+
tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
|
|
1924
|
+
tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
|
|
1925
|
+
tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
|
|
1926
|
+
tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,1024,144.25,144.25,144.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
|
|
1927
|
+
tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,2048,224.25,224.25,224.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
|
|
1928
|
+
tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,4096,384.25,384.25,384.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
|
|
1929
|
+
tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,8192,704.25,704.25,704.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
|
|
1930
|
+
tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,16384,1344.25,1344.25,1344.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
|
|
1931
|
+
tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,1024,90.25,90.25,90.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1932
|
+
tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,2048,116.25,116.25,116.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1933
|
+
tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,4096,168.25,168.25,168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1934
|
+
tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,8192,272.25,272.25,272.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1935
|
+
tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,16384,480.25,480.25,480.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1936
|
+
tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1937
|
+
tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1938
|
+
tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1939
|
+
tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1940
|
+
tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
|
|
1941
|
+
tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
|
|
1942
|
+
tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
|
|
1943
|
+
tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
|
|
1944
|
+
tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
|
|
1945
|
+
tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
|