liger-kernel-nightly 0.6.2.dev20251013144132__tar.gz → 0.6.3.dev20251121010234__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of liger-kernel-nightly might be problematic. Click here for more details.

Files changed (306)
  1. liger_kernel_nightly-0.6.3.dev20251121010234/.github/workflows/docs.yml +64 -0
  2. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/PKG-INFO +1 -1
  3. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/data/all_benchmark_data.csv +240 -0
  4. liger_kernel_nightly-0.6.3.dev20251121010234/benchmark/scripts/benchmark_poly_norm.py +197 -0
  5. liger_kernel_nightly-0.6.3.dev20251121010234/benchmark/scripts/benchmark_tiled_mlp.py +397 -0
  6. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/acknowledgement.md +0 -1
  7. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/pyproject.toml +1 -1
  8. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
  9. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
  10. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/jsd_loss.py +18 -5
  11. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/cross_entropy.py +63 -10
  12. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/fused_linear_cross_entropy.py +43 -13
  13. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/ops/poly_norm.py +386 -0
  14. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/ops/tiled_mlp.py +136 -0
  15. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/__init__.py +18 -0
  16. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/cross_entropy.py +8 -3
  17. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/functional.py +29 -6
  18. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +8 -3
  19. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/falcon_h1.py +19 -5
  20. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/gemma.py +17 -6
  21. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/gemma2.py +14 -5
  22. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/gemma3.py +25 -12
  23. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/glm4.py +16 -4
  24. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/glm4v.py +16 -4
  25. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/glm4v_moe.py +23 -4
  26. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/internvl.py +12 -5
  27. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/llama.py +14 -5
  28. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/llama4.py +16 -4
  29. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/llava.py +12 -4
  30. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/loss_utils.py +31 -3
  31. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/mistral.py +15 -6
  32. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/mixtral.py +16 -7
  33. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/mllama.py +12 -4
  34. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/olmo2.py +16 -4
  35. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/transformers/model/output_classes.py +147 -0
  36. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/paligemma.py +22 -5
  37. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/phi3.py +14 -7
  38. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/qwen2.py +16 -3
  39. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/qwen2_5_vl.py +14 -6
  40. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/qwen2_vl.py +16 -4
  41. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/qwen3.py +20 -5
  42. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/qwen3_moe.py +19 -5
  43. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/transformers/model/qwen3_next.py +146 -0
  44. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/transformers/model/qwen3_vl.py +150 -0
  45. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  46. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/smollm3.py +15 -6
  47. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/transformers/model/smolvlm.py +158 -0
  48. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/monkey_patch.py +401 -17
  49. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/transformers/poly_norm.py +42 -0
  50. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/rms_norm.py +7 -0
  51. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/transformers/rope.py +63 -0
  52. liger_kernel_nightly-0.6.3.dev20251121010234/src/liger_kernel/transformers/tiled_mlp.py +133 -0
  53. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
  54. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel_nightly.egg-info/SOURCES.txt +15 -0
  55. liger_kernel_nightly-0.6.3.dev20251121010234/test/conftest.py +11 -0
  56. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/bf16/test_mini_models.py +259 -8
  57. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/bf16/test_mini_models_multimodal.py +314 -1
  58. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/bf16/test_mini_models_with_logits.py +256 -4
  59. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/fp32/test_mini_models.py +253 -4
  60. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/fp32/test_mini_models_multimodal.py +452 -1
  61. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/fp32/test_mini_models_with_logits.py +270 -1
  62. liger_kernel_nightly-0.6.3.dev20251121010234/test/resources/fake_configs/HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json +1192 -0
  63. liger_kernel_nightly-0.6.3.dev20251121010234/test/resources/fake_configs/Qwen/Qwen3-VL-4B-Instruct/tokenizer_config.json +63 -0
  64. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_cross_entropy.py +81 -6
  65. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_fused_linear_cross_entropy.py +229 -5
  66. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_monkey_patch.py +795 -13
  67. liger_kernel_nightly-0.6.3.dev20251121010234/test/transformers/test_poly_norm.py +281 -0
  68. liger_kernel_nightly-0.6.3.dev20251121010234/test/transformers/test_tiled_mlp.py +216 -0
  69. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/utils.py +67 -0
  70. liger_kernel_nightly-0.6.2.dev20251013144132/.github/workflows/docs.yml +0 -33
  71. liger_kernel_nightly-0.6.2.dev20251013144132/src/liger_kernel/transformers/rope.py +0 -20
  72. liger_kernel_nightly-0.6.2.dev20251013144132/test/conftest.py +0 -8
  73. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
  74. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  75. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/pull_request_template.md +0 -0
  76. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/workflows/amd-ci.yml +0 -0
  77. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/workflows/benchmark.yml +0 -0
  78. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/workflows/intel-ci.yml +0 -0
  79. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/workflows/nvi-ci.yml +0 -0
  80. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/workflows/publish-nightly.yml +0 -0
  81. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.github/workflows/publish-release.yml +0 -0
  82. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/.gitignore +0 -0
  83. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/LICENSE +0 -0
  84. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/Makefile +0 -0
  85. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/NOTICE +0 -0
  86. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/README.md +0 -0
  87. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/README.md +0 -0
  88. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/__init__.py +0 -0
  89. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/benchmarks_visualizer.py +0 -0
  90. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/__init__.py +0 -0
  91. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
  92. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
  93. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_distill_cosine_loss.py +0 -0
  94. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
  95. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
  96. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_dyt.py +0 -0
  97. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_embedding.py +0 -0
  98. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_fused_add_rms_norm.py +0 -0
  99. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
  100. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
  101. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_fused_neighborhood_attention.py +0 -0
  102. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_geglu.py +0 -0
  103. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_group_norm.py +0 -0
  104. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_grpo_loss.py +0 -0
  105. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_jsd.py +0 -0
  106. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_kl_div.py +0 -0
  107. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_kto_loss.py +0 -0
  108. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_layer_norm.py +0 -0
  109. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_llama4_rope.py +0 -0
  110. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_multi_token_attention.py +0 -0
  111. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
  112. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
  113. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_rms_norm.py +0 -0
  114. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_rope.py +0 -0
  115. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
  116. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_softmax.py +0 -0
  117. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_sparse_multi_token_attention.py +0 -0
  118. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_sparsemax.py +0 -0
  119. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_swiglu.py +0 -0
  120. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/benchmark_tvd.py +0 -0
  121. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/benchmark/scripts/utils.py +0 -0
  122. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/dev/fmt-requirements.txt +0 -0
  123. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/dev/modal/benchmarks.py +0 -0
  124. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/dev/modal/tests.py +0 -0
  125. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/dev/modal/tests_bwd.py +0 -0
  126. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/Examples.md +0 -0
  127. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/Getting-Started.md +0 -0
  128. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/High-Level-APIs.md +0 -0
  129. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/Low-Level-APIs.md +0 -0
  130. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/contributing.md +0 -0
  131. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/images/banner.GIF +0 -0
  132. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/images/compose.gif +0 -0
  133. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/images/e2e-memory.png +0 -0
  134. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/images/e2e-tps.png +0 -0
  135. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/images/logo-banner.png +0 -0
  136. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/images/patch.gif +0 -0
  137. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/images/post-training.png +0 -0
  138. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/index.md +0 -0
  139. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/docs/license.md +0 -0
  140. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/alignment/accelerate_config.yaml +0 -0
  141. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/alignment/run_orpo.py +0 -0
  142. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/README.md +0 -0
  143. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/callback.py +0 -0
  144. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/config/fsdp_config.json +0 -0
  145. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/img/gemma_7b_mem.png +0 -0
  146. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/img/gemma_7b_tp.png +0 -0
  147. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/img/llama_mem_alloc.png +0 -0
  148. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/img/llama_tps.png +0 -0
  149. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
  150. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/img/qwen_tps.png +0 -0
  151. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/launch_on_modal.py +0 -0
  152. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/requirements.txt +0 -0
  153. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/run_benchmarks.sh +0 -0
  154. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/run_gemma.sh +0 -0
  155. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/run_llama.sh +0 -0
  156. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/run_qwen.sh +0 -0
  157. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/run_qwen2_vl.sh +0 -0
  158. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/training.py +0 -0
  159. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/huggingface/training_multimodal.py +0 -0
  160. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/lightning/README.md +0 -0
  161. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/lightning/requirements.txt +0 -0
  162. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/lightning/training.py +0 -0
  163. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/README.md +0 -0
  164. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/callback.py +0 -0
  165. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
  166. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
  167. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
  168. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
  169. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
  170. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
  171. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
  172. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
  173. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
  174. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/medusa_util.py +0 -0
  175. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/requirements.txt +0 -0
  176. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
  177. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/examples/medusa/train.py +0 -0
  178. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/licenses/LICENSE-Apache-2.0 +0 -0
  179. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/licenses/LICENSE-MIT-AutoAWQ +0 -0
  180. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
  181. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/licenses/LICENSE-MIT-llmc +0 -0
  182. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/licenses/LICENSE-MIT-triton +0 -0
  183. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/mkdocs.yml +0 -0
  184. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/setup.cfg +0 -0
  185. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/setup.py +0 -0
  186. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/__init__.py +0 -0
  187. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/README.md +0 -0
  188. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  189. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  190. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
  191. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/functional.py +0 -0
  192. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/fused_linear_ppo.py +0 -0
  193. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
  194. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
  195. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
  196. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
  197. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  198. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  199. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/env_report.py +0 -0
  200. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/__init__.py +0 -0
  201. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/dyt.py +0 -0
  202. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  203. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  204. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/fused_add_rms_norm.py +0 -0
  205. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  206. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/fused_neighborhood_attention.py +0 -0
  207. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/geglu.py +0 -0
  208. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/group_norm.py +0 -0
  209. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/grpo_loss.py +0 -0
  210. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/jsd.py +0 -0
  211. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/kl_div.py +0 -0
  212. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/layer_norm.py +0 -0
  213. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/llama4_rope.py +0 -0
  214. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/multi_token_attention.py +0 -0
  215. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  216. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/rms_norm.py +0 -0
  217. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/rope.py +0 -0
  218. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/softmax.py +0 -0
  219. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/sparsemax.py +0 -0
  220. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/swiglu.py +0 -0
  221. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/tvd.py +0 -0
  222. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/ops/utils.py +0 -0
  223. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/auto_model.py +0 -0
  224. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/dyt.py +0 -0
  225. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/experimental/__init__.py +0 -0
  226. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  227. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/fsdp.py +0 -0
  228. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/fused_add_rms_norm.py +0 -0
  229. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  230. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/fused_neighborhood_attention.py +0 -0
  231. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/geglu.py +0 -0
  232. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/group_norm.py +0 -0
  233. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/grpo_loss.py +0 -0
  234. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/jsd.py +0 -0
  235. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/kl_div.py +0 -0
  236. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/layer_norm.py +0 -0
  237. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/llama4_rope.py +0 -0
  238. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/model/__init__.py +0 -0
  239. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/multi_token_attention.py +0 -0
  240. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  241. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/softmax.py +0 -0
  242. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/sparsemax.py +0 -0
  243. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/swiglu.py +0 -0
  244. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
  245. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
  246. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  247. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/transformers/tvd.py +0 -0
  248. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/triton/__init__.py +0 -0
  249. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/triton/monkey_patch.py +0 -0
  250. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel/utils.py +0 -0
  251. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  252. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  253. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
  254. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/__init__.py +0 -0
  255. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/__init__.py +0 -0
  256. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/test_cosine_loss.py +0 -0
  257. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/test_cpo_loss.py +0 -0
  258. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/test_dpo_loss.py +0 -0
  259. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/test_grpo_loss.py +0 -0
  260. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/test_jsd_loss.py +0 -0
  261. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/test_kto_loss.py +0 -0
  262. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/test_orpo_loss.py +0 -0
  263. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/chunked_loss/test_simpo_loss.py +0 -0
  264. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/__init__.py +0 -0
  265. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/bf16/__init__.py +0 -0
  266. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/convergence/fp32/__init__.py +0 -0
  267. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json +0 -0
  268. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json +0 -0
  269. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json +0 -0
  270. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json +0 -0
  271. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json +0 -0
  272. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json +0 -0
  273. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
  274. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json +0 -0
  275. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
  276. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/fake_configs/meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json +0 -0
  277. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
  278. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/tiny_shakespeare.txt +0 -0
  279. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
  280. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
  281. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
  282. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_auto_model.py +0 -0
  283. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_dyt.py +0 -0
  284. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_embedding.py +0 -0
  285. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_flex_attention.py +0 -0
  286. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_fused_add_rms_norm.py +0 -0
  287. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_fused_linear_jsd.py +0 -0
  288. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_fused_neighborhood_attention.py +0 -0
  289. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_geglu.py +0 -0
  290. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_group_norm.py +0 -0
  291. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_grpo_loss.py +0 -0
  292. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_jsd.py +0 -0
  293. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_kl_div.py +0 -0
  294. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_layer_norm.py +0 -0
  295. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_mm_int8int2.py +0 -0
  296. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_multi_token_attention.py +0 -0
  297. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_qwen2vl_mrope.py +0 -0
  298. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_rms_norm.py +0 -0
  299. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_rope.py +0 -0
  300. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_softmax.py +0 -0
  301. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_sparsemax.py +0 -0
  302. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_swiglu.py +0 -0
  303. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_trainer_integration.py +0 -0
  304. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_transformers.py +0 -0
  305. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/transformers/test_tvd.py +0 -0
  306. {liger_kernel_nightly-0.6.2.dev20251013144132 → liger_kernel_nightly-0.6.3.dev20251121010234}/test/triton/test_triton_monkey_patch.py +0 -0
@@ -0,0 +1,64 @@
1
+ name: Publish documentation
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ paths:
7
+ - 'docs/**'
8
+ - 'mkdocs.yml'
9
+
10
+ permissions:
11
+ contents: write
12
+ jobs:
13
+ deploy:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - name: Configure Git Credentials
18
+ run: |
19
+ git config user.name github-actions[bot]
20
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
21
+ - uses: actions/setup-python@v5
22
+ with:
23
+ python-version: 3.x
24
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
25
+ - uses: actions/cache@v4
26
+ with:
27
+ key: mkdocs-material-${{ env.cache_id }}
28
+ path: .cache
29
+ restore-keys: |
30
+ mkdocs-material-
31
+ - run: pip install mkdocs-material mkdocstrings[python]
32
+ # ====== Backup the benchmarks from gh-pages ======
33
+ # This is necessary because the benchmarks are not included in the documentation build process.
34
+ # So we need to backup the benchmarks from gh-pages and restore them after the documentation is built.
35
+ - name: Backup benchmarks from gh-pages
36
+ run: |
37
+ git fetch origin gh-pages
38
+ # create worktree bound to local gh-pages, tracking origin/gh-pages
39
+ git branch -f gh-pages origin/gh-pages || true
40
+ mkdir -p ghp && git worktree add ghp gh-pages || true
41
+ if [ -d ghp/benchmarks ]; then
42
+ tar -C ghp -czf /tmp/benchmarks.tgz benchmarks
43
+ fi
44
+ # IMPORTANT: remove worktree so gh-pages isn't checked out anywhere
45
+ git worktree remove ghp --force || true
46
+ echo "Backed up benchmarks from gh-pages"
47
+ # ====== Deploy the documentation ======
48
+ - name: Deploy documentation
49
+ run: mkdocs gh-deploy --force
50
+ # ====== Restore the benchmarks onto gh-pages ======
51
+ # This is necessary because the benchmarks are not included in the documentation build process.
52
+ # So we need to restore the benchmarks onto gh-pages after the documentation is built.
53
+ - name: Restore benchmarks onto gh-pages
54
+ run: |
55
+ # Refresh remote tracking and recreate a clean worktree
56
+ git fetch origin gh-pages
57
+ git worktree add -B gh-pages ghp origin/gh-pages
58
+ if [ -f /tmp/benchmarks.tgz ]; then
59
+ tar -C ghp -xzf /tmp/benchmarks.tgz
60
+ git -C ghp add -A
61
+ git -C ghp commit -m "Restore benchmarks after gh-deploy" || echo "No changes"
62
+ git -C ghp push origin gh-pages
63
+ fi
64
+ git worktree remove ghp --force || true
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.2.dev20251013144132
3
+ Version: 0.6.3.dev20251121010234
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -1703,3 +1703,243 @@ llama4_rope,huggingface,full,memory,MB,T,sequence length,2048,314.01611328125,31
1703
1703
  llama4_rope,huggingface,full,memory,MB,T,sequence length,4096,596.03173828125,596.03173828125,596.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1704
1704
  llama4_rope,huggingface,full,memory,MB,T,sequence length,8192,1160.06298828125,1160.06298828125,1160.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1705
1705
  llama4_rope,huggingface,full,memory,MB,T,sequence length,16384,2288.12548828125,2288.12548828125,2288.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1
1706
+ tiled_geglu,liger,full,speed,ms,T,sequence length,1024,2.1678080558776855,2.166579246520996,2.1682305335998535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
1707
+ tiled_geglu,liger,full,speed,ms,T,sequence length,2048,4.344256401062012,4.343987464904785,4.34452486038208,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
1708
+ tiled_geglu,liger,full,speed,ms,T,sequence length,4096,8.653023719787598,8.653023719787598,8.653023719787598,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
1709
+ tiled_geglu,liger,full,speed,ms,T,sequence length,8192,16.909311294555664,16.909311294555664,16.909311294555664,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
1710
+ tiled_geglu,liger,full,speed,ms,T,sequence length,16384,33.63123321533203,33.63123321533203,33.63123321533203,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3
1711
+ tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,1024,3.353935956954956,3.353523015975952,3.35434889793396,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
1712
+ tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,2048,6.023168087005615,6.023168087005615,6.023168087005615,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
1713
+ tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,4096,11.495424270629883,11.495424270629883,11.495424270629883,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
1714
+ tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,8192,23.68614387512207,23.68614387512207,23.68614387512207,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
1715
+ tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,16384,47.478782653808594,47.478782653808594,47.478782653808594,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3
1716
+ tiled_geglu,liger,forward,speed,ms,T,sequence length,1024,0.6614400148391724,0.6594560146331787,0.6635519862174988,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1717
+ tiled_geglu,liger,forward,speed,ms,T,sequence length,2048,1.3471999168395996,1.346560001373291,1.3475840091705322,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1718
+ tiled_geglu,liger,forward,speed,ms,T,sequence length,4096,2.752511978149414,2.7261502742767334,2.7844607830047607,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1719
+ tiled_geglu,liger,forward,speed,ms,T,sequence length,8192,5.433343887329102,5.433343887329102,5.433343887329102,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1720
+ tiled_geglu,liger,forward,speed,ms,T,sequence length,16384,10.712063789367676,10.712063789367676,10.712063789367676,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1721
+ tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,1024,0.7403519749641418,0.7402047514915466,0.7413759827613831,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
1722
+ tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,2048,1.3941760063171387,1.3895679712295532,1.398144006729126,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
1723
+ tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,4096,2.7586560249328613,2.7585408687591553,2.759884834289551,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
1724
+ tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,8192,5.789696216583252,5.789696216583252,5.789696216583252,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
1725
+ tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,16384,11.810815811157227,11.810815811157227,11.810815811157227,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3
1726
+ tiled_geglu,liger,backward,speed,ms,T,sequence length,1024,1.491968035697937,1.4916608333587646,1.4940160512924194,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
1727
+ tiled_geglu,liger,backward,speed,ms,T,sequence length,2048,3.0185279846191406,3.0131328105926514,3.0555264949798584,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
1728
+ tiled_geglu,liger,backward,speed,ms,T,sequence length,4096,6.021120071411133,6.021120071411133,6.021120071411133,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
1729
+ tiled_geglu,liger,backward,speed,ms,T,sequence length,8192,11.512767791748047,11.512767791748047,11.512767791748047,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
1730
+ tiled_geglu,liger,backward,speed,ms,T,sequence length,16384,22.806528091430664,22.806528091430664,22.806528091430664,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3
1731
+ tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,1024,2.6060800552368164,2.6053311824798584,2.607308864593506,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
1732
+ tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,2048,4.665375709533691,4.664742469787598,4.666009426116943,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
1733
+ tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,4096,8.71731185913086,8.71731185913086,8.71731185913086,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
1734
+ tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,8192,17.99782371520996,17.99782371520996,17.99782371520996,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
1735
+ tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,16384,35.64400100708008,35.64400100708008,35.64400100708008,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3
1736
+ tiled_geglu,liger,full,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1737
+ tiled_geglu,liger,full,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1738
+ tiled_geglu,liger,full,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1739
+ tiled_geglu,liger,full,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1740
+ tiled_geglu,liger,full,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1741
+ tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1742
+ tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1743
+ tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1744
+ tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1745
+ tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3
1746
+ tiled_geglu,liger,forward,memory,MB,T,sequence length,1024,128.25,128.25,128.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
1747
+ tiled_geglu,liger,forward,memory,MB,T,sequence length,2048,192.25,192.25,192.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
1748
+ tiled_geglu,liger,forward,memory,MB,T,sequence length,4096,320.25,320.25,320.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
1749
+ tiled_geglu,liger,forward,memory,MB,T,sequence length,8192,576.25,576.25,576.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
1750
+ tiled_geglu,liger,forward,memory,MB,T,sequence length,16384,1088.25,1088.25,1088.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3
1751
+ tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,1024,92.25,92.25,92.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1752
+ tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,2048,120.25,120.25,120.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1753
+ tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,4096,176.25,176.25,176.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1754
+ tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,8192,288.25,288.25,288.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1755
+ tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,16384,512.25,512.25,512.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1756
+ tiled_geglu,liger,backward,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1757
+ tiled_geglu,liger,backward,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1758
+ tiled_geglu,liger,backward,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1759
+ tiled_geglu,liger,backward,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1760
+ tiled_geglu,liger,backward,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1761
+ tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
1762
+ tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
1763
+ tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
1764
+ tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
1765
+ tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3
1766
+ tiled_swiglu,liger,full,speed,ms,T,sequence length,1024,2.165760040283203,2.164659261703491,2.167193651199341,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
1767
+ tiled_swiglu,liger,full,speed,ms,T,sequence length,2048,4.371456146240234,4.368383884429932,4.374527931213379,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
1768
+ tiled_swiglu,liger,full,speed,ms,T,sequence length,4096,8.935423851013184,8.935423851013184,8.935423851013184,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
1769
+ tiled_swiglu,liger,full,speed,ms,T,sequence length,8192,17.078943252563477,17.078943252563477,17.078943252563477,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
1770
+ tiled_swiglu,liger,full,speed,ms,T,sequence length,16384,33.74857711791992,33.74857711791992,33.74857711791992,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3
1771
+ tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,1024,3.3510398864746094,3.3507328033447266,3.3513472080230713,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
1772
+ tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,2048,6.023168087005615,6.023168087005615,6.023168087005615,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
1773
+ tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,4096,11.609087944030762,11.609087944030762,11.609087944030762,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
1774
+ tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,8192,23.8591365814209,23.8591365814209,23.8591365814209,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
1775
+ tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,16384,47.721473693847656,47.721473693847656,47.721473693847656,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3
1776
+ tiled_swiglu,liger,forward,speed,ms,T,sequence length,1024,0.6594560146331787,0.6594560146331787,0.6604800224304199,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
1777
+ tiled_swiglu,liger,forward,speed,ms,T,sequence length,2048,1.3537280559539795,1.3527040481567383,1.3547519445419312,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
1778
+ tiled_swiglu,liger,forward,speed,ms,T,sequence length,4096,2.7152960300445557,2.715123176574707,2.7155072689056396,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
1779
+ tiled_swiglu,liger,forward,speed,ms,T,sequence length,8192,5.3361921310424805,5.3361921310424805,5.3361921310424805,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
1780
+ tiled_swiglu,liger,forward,speed,ms,T,sequence length,16384,10.870783805847168,10.870783805847168,10.870783805847168,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3
1781
+ tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,1024,0.7395360469818115,0.7383040189743042,0.7413759827613831,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1782
+ tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,2048,1.3965599536895752,1.387935996055603,1.4024640321731567,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1783
+ tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,4096,2.7778561115264893,2.777395248413086,2.7780096530914307,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1784
+ tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,8192,5.829631805419922,5.829631805419922,5.829631805419922,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1785
+ tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,16384,11.841535568237305,11.841535568237305,11.841535568237305,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1786
+ tiled_swiglu,liger,backward,speed,ms,T,sequence length,1024,1.4970879554748535,1.4961408376693726,1.4970879554748535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
1787
+ tiled_swiglu,liger,backward,speed,ms,T,sequence length,2048,3.052351951599121,3.0518529415130615,3.0550782680511475,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
1788
+ tiled_swiglu,liger,backward,speed,ms,T,sequence length,4096,6.074687957763672,6.074687957763672,6.074687957763672,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
1789
+ tiled_swiglu,liger,backward,speed,ms,T,sequence length,8192,11.630592346191406,11.630592346191406,11.630592346191406,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
1790
+ tiled_swiglu,liger,backward,speed,ms,T,sequence length,16384,22.76793670654297,22.76793670654297,22.76793670654297,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3
1791
+ tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,1024,2.6021440029144287,2.6000702381134033,2.6032767295837402,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
1792
+ tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,2048,4.641791820526123,4.641791820526123,4.641791820526123,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
1793
+ tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,4096,8.761343955993652,8.761343955993652,8.761343955993652,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
1794
+ tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,8192,17.966079711914062,17.966079711914062,17.966079711914062,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
1795
+ tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,16384,35.657344818115234,35.657344818115234,35.657344818115234,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3
1796
+ tiled_swiglu,liger,full,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
1797
+ tiled_swiglu,liger,full,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
1798
+ tiled_swiglu,liger,full,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
1799
+ tiled_swiglu,liger,full,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
1800
+ tiled_swiglu,liger,full,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3
1801
+ tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
1802
+ tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
1803
+ tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
1804
+ tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
1805
+ tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3
1806
+ tiled_swiglu,liger,forward,memory,MB,T,sequence length,1024,128.25,128.25,128.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1807
+ tiled_swiglu,liger,forward,memory,MB,T,sequence length,2048,192.25,192.25,192.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1808
+ tiled_swiglu,liger,forward,memory,MB,T,sequence length,4096,320.25,320.25,320.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1809
+ tiled_swiglu,liger,forward,memory,MB,T,sequence length,8192,576.25,576.25,576.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1810
+ tiled_swiglu,liger,forward,memory,MB,T,sequence length,16384,1088.25,1088.25,1088.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1811
+ tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,1024,92.25,92.25,92.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1812
+ tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,2048,120.25,120.25,120.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1813
+ tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,4096,176.25,176.25,176.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1814
+ tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,8192,288.25,288.25,288.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1815
+ tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,16384,512.25,512.25,512.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3
1816
+ tiled_swiglu,liger,backward,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
1817
+ tiled_swiglu,liger,backward,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
1818
+ tiled_swiglu,liger,backward,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
1819
+ tiled_swiglu,liger,backward,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
1820
+ tiled_swiglu,liger,backward,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3
1821
+ tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
1822
+ tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
1823
+ tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
1824
+ tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
1825
+ tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3
1826
+ tiled_geglu,huggingface,full,speed,ms,T,sequence length,1024,2.3357439041137695,2.3357439041137695,2.3375871181488037,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
1827
+ tiled_geglu,huggingface,full,speed,ms,T,sequence length,2048,4.764671802520752,4.764671802520752,4.764671802520752,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
1828
+ tiled_geglu,huggingface,full,speed,ms,T,sequence length,4096,9.4236478805542,9.4236478805542,9.4236478805542,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
1829
+ tiled_geglu,huggingface,full,speed,ms,T,sequence length,8192,17.628543853759766,17.628543853759766,17.628543853759766,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
1830
+ tiled_geglu,huggingface,full,speed,ms,T,sequence length,16384,35.06790542602539,35.06790542602539,35.06790542602539,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3
1831
+ tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,1024,3.418976068496704,3.4176511764526367,3.4203009605407715,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
1832
+ tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,2048,6.158143997192383,6.158143997192383,6.158143997192383,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
1833
+ tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,4096,11.934720039367676,11.934720039367676,11.934720039367676,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
1834
+ tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,8192,24.731647491455078,24.731647491455078,24.731647491455078,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
1835
+ tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,16384,49.46227264404297,49.46227264404297,49.46227264404297,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3
1836
+ tiled_geglu,huggingface,forward,speed,ms,T,sequence length,1024,0.6743040084838867,0.6736640334129333,0.677068829536438,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1837
+ tiled_geglu,huggingface,forward,speed,ms,T,sequence length,2048,1.418239951133728,1.418239951133728,1.421120047569275,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1838
+ tiled_geglu,huggingface,forward,speed,ms,T,sequence length,4096,2.88972806930542,2.889113664627075,2.8909568786621094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1839
+ tiled_geglu,huggingface,forward,speed,ms,T,sequence length,8192,5.701375961303711,5.701375961303711,5.701375961303711,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1840
+ tiled_geglu,huggingface,forward,speed,ms,T,sequence length,16384,11.276288032531738,11.276288032531738,11.276288032531738,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3
1841
+ tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,1024,0.7433919906616211,0.7423999905586243,0.7444480061531067,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
1842
+ tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,2048,1.4137760400772095,1.4131200313568115,1.4152319431304932,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
1843
+ tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,4096,2.8241920471191406,2.823500871658325,2.8266496658325195,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
1844
+ tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,8192,6.087679862976074,6.087679862976074,6.087679862976074,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
1845
+ tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,16384,12.353535652160645,12.353535652160645,12.353535652160645,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3
1846
+ tiled_geglu,huggingface,backward,speed,ms,T,sequence length,1024,1.5499199628829956,1.5489535331726074,1.5523840188980103,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
1847
+ tiled_geglu,huggingface,backward,speed,ms,T,sequence length,2048,3.171328067779541,3.169484853744507,3.173171281814575,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
1848
+ tiled_geglu,huggingface,backward,speed,ms,T,sequence length,4096,6.263807773590088,6.263807773590088,6.263807773590088,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
1849
+ tiled_geglu,huggingface,backward,speed,ms,T,sequence length,8192,12.046143531799316,12.046143531799316,12.046143531799316,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
1850
+ tiled_geglu,huggingface,backward,speed,ms,T,sequence length,16384,23.839744567871094,23.839744567871094,23.839744567871094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3
1851
+ tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,1024,2.6757121086120605,2.6755776405334473,2.676710367202759,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
1852
+ tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,2048,4.7329277992248535,4.7329277992248535,4.7329277992248535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
1853
+ tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,4096,9.078783988952637,9.078783988952637,9.078783988952637,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
1854
+ tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,8192,18.63680076599121,18.63680076599121,18.63680076599121,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
1855
+ tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,16384,37.06163024902344,37.06163024902344,37.06163024902344,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3
1856
+ tiled_geglu,huggingface,full,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
1857
+ tiled_geglu,huggingface,full,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
1858
+ tiled_geglu,huggingface,full,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
1859
+ tiled_geglu,huggingface,full,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
1860
+ tiled_geglu,huggingface,full,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3
1861
+ tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1862
+ tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1863
+ tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1864
+ tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1865
+ tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1866
+ tiled_geglu,huggingface,forward,memory,MB,T,sequence length,1024,144.25,144.25,144.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1867
+ tiled_geglu,huggingface,forward,memory,MB,T,sequence length,2048,224.25,224.25,224.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1868
+ tiled_geglu,huggingface,forward,memory,MB,T,sequence length,4096,384.25,384.25,384.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1869
+ tiled_geglu,huggingface,forward,memory,MB,T,sequence length,8192,704.25,704.25,704.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1870
+ tiled_geglu,huggingface,forward,memory,MB,T,sequence length,16384,1344.25,1344.25,1344.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3
1871
+ tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,1024,90.25,90.25,90.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1872
+ tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,2048,116.25,116.25,116.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1873
+ tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,4096,168.25,168.25,168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1874
+ tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,8192,272.25,272.25,272.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1875
+ tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,16384,480.25,480.25,480.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3
1876
+ tiled_geglu,huggingface,backward,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1877
+ tiled_geglu,huggingface,backward,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1878
+ tiled_geglu,huggingface,backward,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1879
+ tiled_geglu,huggingface,backward,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1880
+ tiled_geglu,huggingface,backward,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3
1881
+ tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
1882
+ tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
1883
+ tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
1884
+ tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
1885
+ tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3
1886
+ tiled_swiglu,huggingface,full,speed,ms,T,sequence length,1024,2.2517759799957275,2.2517759799957275,2.254848003387451,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
1887
+ tiled_swiglu,huggingface,full,speed,ms,T,sequence length,2048,4.588511943817139,4.587302207946777,4.5897216796875,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
1888
+ tiled_swiglu,huggingface,full,speed,ms,T,sequence length,4096,9.233407974243164,9.233407974243164,9.233407974243164,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
1889
+ tiled_swiglu,huggingface,full,speed,ms,T,sequence length,8192,17.869823455810547,17.869823455810547,17.869823455810547,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
1890
+ tiled_swiglu,huggingface,full,speed,ms,T,sequence length,16384,35.34422302246094,35.34422302246094,35.34422302246094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3
1891
+ tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,1024,3.4257922172546387,3.424870491027832,3.426713705062866,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
1892
+ tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,2048,6.155263900756836,6.155263900756836,6.155263900756836,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
1893
+ tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,4096,11.92959976196289,11.92959976196289,11.92959976196289,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
1894
+ tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,8192,24.815616607666016,24.815616607666016,24.815616607666016,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
1895
+ tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,16384,49.62918472290039,49.62918472290039,49.62918472290039,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3
1896
+ tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,1024,0.6748160123825073,0.6737920045852661,0.6758400201797485,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
1897
+ tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,2048,1.4332799911499023,1.4325759410858154,1.4335999488830566,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
1898
+ tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,4096,2.91212797164917,2.904217481613159,2.9146623611450195,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
1899
+ tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,8192,5.658976078033447,5.658976078033447,5.658976078033447,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
1900
+ tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,16384,11.341952323913574,11.341952323913574,11.341952323913574,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3
1901
+ tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,1024,0.7454720139503479,0.7429631948471069,0.7456768155097961,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1902
+ tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,2048,1.4120960235595703,1.410048007965088,1.4120960235595703,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1903
+ tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,4096,2.825216054916382,2.825216054916382,2.8264448642730713,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1904
+ tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,8192,6.077439785003662,6.077439785003662,6.077439785003662,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1905
+ tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,16384,12.356608390808105,12.356608390808105,12.356608390808105,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3
1906
+ tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,1024,1.551360011100769,1.5511807203292847,1.5532032251358032,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
1907
+ tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,2048,3.1928319931030273,3.1885311603546143,3.1971328258514404,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
1908
+ tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,4096,6.273248195648193,6.273248195648193,6.273248195648193,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
1909
+ tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,8192,12.058752059936523,12.058752059936523,12.058752059936523,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
1910
+ tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,16384,23.853055953979492,23.853055953979492,23.853055953979492,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3
1911
+ tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,1024,2.6746881008148193,2.6728639602661133,2.6789886951446533,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1912
+ tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,2048,4.739071846008301,4.739071846008301,4.739071846008301,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1913
+ tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,4096,9.084927558898926,9.084927558898926,9.084927558898926,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1914
+ tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,8192,18.729759216308594,18.729759216308594,18.729759216308594,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1915
+ tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,16384,37.13724899291992,37.13724899291992,37.13724899291992,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1916
+ tiled_swiglu,huggingface,full,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1917
+ tiled_swiglu,huggingface,full,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1918
+ tiled_swiglu,huggingface,full,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1919
+ tiled_swiglu,huggingface,full,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1920
+ tiled_swiglu,huggingface,full,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3
1921
+ tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
1922
+ tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
1923
+ tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
1924
+ tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
1925
+ tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3
1926
+ tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,1024,144.25,144.25,144.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
1927
+ tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,2048,224.25,224.25,224.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
1928
+ tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,4096,384.25,384.25,384.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
1929
+ tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,8192,704.25,704.25,704.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
1930
+ tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,16384,1344.25,1344.25,1344.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3
1931
+ tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,1024,90.25,90.25,90.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1932
+ tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,2048,116.25,116.25,116.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1933
+ tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,4096,168.25,168.25,168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1934
+ tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,8192,272.25,272.25,272.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1935
+ tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,16384,480.25,480.25,480.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1936
+ tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1937
+ tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1938
+ tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1939
+ tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1940
+ tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3
1941
+ tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
1942
+ tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
1943
+ tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
1944
+ tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3
1945
+ tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3