sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0

sglang/srt/layers/layernorm.py

@@ -20,10 +20,11 @@ import torch
  import torch.nn as nn

  from sglang.srt.custom_op import CustomOp
- from sglang.srt.utils import is_cuda, is_hip
+ from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip

  _is_cuda = is_cuda()
  _is_hip = is_hip()
+ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

  if _is_cuda:
  from sgl_kernel import (
@@ -33,7 +34,10 @@ if _is_cuda:
  rmsnorm,
  )

- if _is_hip:
+ if _use_aiter:
+ from aiter import rmsnorm2d_fwd as rms_norm
+ from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
+ elif _is_hip:
  from vllm._custom_ops import fused_add_rms_norm, rms_norm

  logger = logging.getLogger(__name__)
@@ -48,16 +52,8 @@ class RMSNorm(CustomOp):
  super().__init__()
  self.weight = nn.Parameter(torch.ones(hidden_size))
  self.variance_epsilon = eps
-
- def forward(self, *args, **kwargs):
- if torch.compiler.is_compiling():
- return self.forward_native(*args, **kwargs)
- if _is_cuda:
- return self.forward_cuda(*args, **kwargs)
- elif _is_hip:
- return self.forward_hip(*args, **kwargs)
- else:
- return self.forward_native(*args, **kwargs)
+ if _use_aiter:
+ self._forward_method = self.forward_aiter

  def forward_cuda(
  self,
@@ -70,6 +66,25 @@ class RMSNorm(CustomOp):
  out = rmsnorm(x, self.weight.data, self.variance_epsilon)
  return out

+ def forward_aiter(
+ self,
+ x: torch.Tensor,
+ residual: Optional[torch.Tensor] = None,
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+ if residual is not None:
+ residual_out = torch.empty_like(x)
+ output = torch.empty_like(x)
+ fused_add_rms_norm(
+ output,
+ x,
+ residual,
+ residual_out,
+ self.weight.data,
+ self.variance_epsilon,
+ )
+ return output, residual_out
+ return rms_norm(x, self.weight.data, self.variance_epsilon)
+
  def forward_hip(
  self,
  x: torch.Tensor,
@@ -117,13 +132,9 @@ class GemmaRMSNorm(CustomOp):
  self.weight = nn.Parameter(torch.zeros(hidden_size))
  self.variance_epsilon = eps

- def forward(self, *args, **kwargs):
- if torch.compiler.is_compiling():
- return self.forward_native(*args, **kwargs)
- if _is_cuda:
- return self.forward_cuda(*args, **kwargs)
- else:
- return self.forward_native(*args, **kwargs)
+ # Re-dispatch
+ if _is_hip:
+ self._forward_method = self.forward_native

  def forward_native(
  self,

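The layernorm hunks above do two things: they route RMSNorm through AITER's fused kernels on ROCm when SGLANG_USE_AITER is set, and they move backend selection from a per-call forward() branch to a one-time assignment of self._forward_method at construction. The following is a minimal, self-contained sketch of that construction-time dispatch pattern; DispatchingRMSNorm and the simplified get_bool_env_var are hypothetical stand-ins (not the actual sglang CustomOp/RMSNorm code), and the AITER branch is stubbed out.

# Minimal sketch of the construction-time dispatch pattern used above.
# Hypothetical stand-in, not the real sglang CustomOp/RMSNorm implementation.
import os

import torch
import torch.nn as nn


def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Simplified version of sglang.srt.utils.get_bool_env_var.
    return os.getenv(name, default).lower() in ("1", "true", "yes")


class DispatchingRMSNorm(nn.Module):
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        # Resolve the backend once instead of branching on every call.
        if get_bool_env_var("SGLANG_USE_AITER"):
            self._forward_method = self.forward_aiter
        else:
            self._forward_method = self.forward_native

    def forward(self, x, residual=None):
        return self._forward_method(x, residual)

    def forward_native(self, x, residual=None):
        if residual is not None:
            x = x + residual
            residual = x
        variance = x.pow(2).mean(dim=-1, keepdim=True)
        out = x * torch.rsqrt(variance + self.variance_epsilon) * self.weight
        return out if residual is None else (out, residual)

    def forward_aiter(self, x, residual=None):
        # The real path calls aiter.rmsnorm2d_fwd / rmsnorm2d_fwd_with_add,
        # as shown in the diff above; fall back to the native math here.
        return self.forward_native(x, residual)
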
sglang/srt/layers/linear.py

@@ -546,8 +546,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
  param.shard_id.append(loaded_shard_id)
  param.shard_id_map[loaded_shard_id] = len(param.data_container)
  param.data_container.append(loaded_weight)
- if len(param.data_container) == 2:
- self.qweight = param.materialize_nested()
  return

  param_data = param.data
@@ -961,8 +959,6 @@ class QKVParallelLinear(ColumnParallelLinear):
  param.shard_id.append(loaded_shard_id)
  param.shard_id_map[loaded_shard_id] = len(param.data_container)
  param.data_container.append(loaded_weight)
- if len(param.data_container) == 3:
- self.qweight = param.materialize_nested()
  return

  param_data = param.data

sglang/srt/layers/logits_processor.py

@@ -47,18 +47,6 @@ from sglang.srt.utils import dump_to_file
  logger = logging.getLogger(__name__)


- from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
- from sglang.srt.managers.schedule_batch import global_server_args_dict
- from sglang.srt.model_executor.forward_batch_info import (
- CaptureHiddenMode,
- ForwardBatch,
- ForwardMode,
- )
- from sglang.srt.utils import dump_to_file
-
- logger = logging.getLogger(__name__)
-
-
  @dataclasses.dataclass
  class LogitsProcessorOutput:
  ## Part 1: This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor

sglang/srt/layers/moe/cutlass_moe.py

@@ -1,4 +1,4 @@
- """Cutlass MoE kernel."""
+ """CUTLASS based Fused MoE kernels."""

  import functools
  import json
@@ -8,19 +8,24 @@ from typing import Any, Callable, Dict, List, Optional, Tuple

  import torch

+ from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams
  from sglang.srt.utils import is_cuda

  _is_cuda = is_cuda()
  if _is_cuda:
  import sgl_kernel
  from sgl_kernel import (
+ apply_shuffle_mul_sum,
+ cutlass_fp4_group_mm,
  fp8_blockwise_scaled_grouped_mm,
  prepare_moe_input,
+ scaled_fp4_experts_quant,
+ shuffle_rows,
  silu_and_mul,
  )


- def cutlass_fused_experts(
+ def cutlass_fused_experts_fp8(
  a: torch.Tensor,
  w1_q: torch.Tensor,
  w2_q: torch.Tensor,
@@ -147,8 +152,8 @@ def cutlass_fused_experts(
  k,
  )

- rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype)
- rep_a1_scales = a1_scale[a_map]
+ rep_a_q = shuffle_rows(a_q, a_map, (m * topk, k))
+ rep_a1_scales = shuffle_rows(a1_scale, a_map, (m * topk, int(k / 128)))

  c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype)
  c2 = torch.empty((m * topk, k), device=device, dtype=out_dtype)
@@ -202,6 +207,164 @@ def cutlass_fused_experts(
  expert_offsets[:-1],
  workspace,
  )
- return (
- c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype)
- ).sum(dim=1)
+
+ result = torch.empty((m, k), device=device, dtype=out_dtype)
+ return apply_shuffle_mul_sum(c2, result, c_map, topk_weights)
+
+
+ FLOAT4_E2M1_MAX = 6.0
+ FLOAT8_E4M3_MAX = 448.0
+
+
+ def cutlass_moe_fp4(
+ a: torch.Tensor,
+ a1_gscale: torch.Tensor,
+ w1_fp4: torch.Tensor,
+ w1_blockscale: torch.Tensor,
+ w1_alphas: torch.Tensor,
+ a2_gscale: torch.Tensor,
+ w2_fp4: torch.Tensor,
+ w2_blockscale: torch.Tensor,
+ w2_alphas: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ params: CutlassMoEParams,
+ apply_router_weight_on_input: bool = False,
+ ):
+ """
+ MoE implementation for FP4 Inputs
+
+ # Gemm 1
+ a: Input tensor: [m, k] (half/bfloat16)
+ a1_gscale: Activation scale per expert: [e] (float32)
+ w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k]
+ w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1)
+ (Note: `n` is the up projection output dim, `k` is the input dim in
+ full precision)
+ w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3)
+ (Block size = 16 for NVFP4)
+
+ # Gemm 2
+ a2_gscale: Activation scale per expert: [e]
+ w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n]
+ w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1)
+ w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3
+
+ Strides for activations, weights and output in logical number of elements.
+ The activations & output stride is the number of elements to the next row.
+ The weights stride is the number of elements to the next row per expert.
+ For example, if the weight is [e, n, k], then the b_stride is a tensor of
+ shape [e] with each element being k. Similarly for activations, if the
+ shape is [m, k], then the a_stride has shape [e] with each value k.
+ Similarly for output, if the output is [m, n], then the c_stride is a
+ tensor of shape [e] with each element being k.
+
+ Note: cutlass_fp4_group_mm is designed to accept the strides of
+ activations and weights to be the same, so it is passed in as a single
+ tensor.
+ ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+ ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+ c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+ c_strides_2: [e] dtype: int64 [Gemm 2: Output Strides]
+
+ topk_weights: [m, topk] dtype: float8
+ topk_ids: [m, topk] dtype: float8
+
+ m, n, k: Unquantized weight shapes, dtype: int
+ e: number of experts for the current rank, dtype: int
+ assumes that topk < k < n to satisfy - up/down projection expectations.
+ """
+ assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+ assert w1_fp4.dtype == torch.uint8, "weight 1 must be uint8"
+ assert w2_fp4.dtype == torch.uint8, "weight 2 must be uint8"
+ assert (
+ w1_fp4.ndim == 3
+ and w2_fp4.ndim == 3
+ and w1_blockscale.ndim == 3
+ and w2_blockscale.ndim == 3
+ ), "All Weights must be of rank 3 for cutlass_moe_fp4"
+ m_a, k_a = a.shape
+ e_w1, nx2_w1, half_k_w1 = w1_fp4.shape
+ e_w2, k_w2, half_n_w2 = w2_fp4.shape
+
+ assert e_w1 == e_w2 and e_w1 == params.num_experts, (
+ "Number of experts must match",
+ " between weights.",
+ )
+ assert (
+ k_a // 2 == half_k_w1 and params.hidden_size == k_w2
+ ), "Hidden size mismatch between a, w1 and w2"
+ assert (
+ nx2_w1 == params.intermediate_size_per_partition * 2
+ and half_n_w2 == params.intermediate_size_per_partition // 2
+ ), ("mismatch in " "expected `n`")
+ assert 2 * half_k_w1 == k_w2, "Hidden size mismatch w2 and w1"
+ assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype"
+
+ out_dtype = a.dtype
+ num_topk = topk_ids.shape[1]
+ device = a.device
+ a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+ c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+ prepare_moe_input(
+ topk_ids,
+ params.expert_offsets,
+ params.problem_sizes1,
+ params.problem_sizes2,
+ a_map,
+ c_map,
+ params.num_experts,
+ params.intermediate_size_per_partition,
+ params.hidden_size,
+ params.blockscale_offsets,
+ )
+
+ rep_a_fp4, rep_a_blockscale = scaled_fp4_experts_quant(
+ a,
+ a1_gscale,
+ params.expert_offsets,
+ params.blockscale_offsets,
+ num_topk,
+ expert_map=a_map,
+ )
+ c1 = cutlass_fp4_group_mm(
+ rep_a_fp4,
+ w1_fp4,
+ rep_a_blockscale,
+ w1_blockscale,
+ w1_alphas,
+ out_dtype,
+ device,
+ params.to_gemm1_args(),
+ )
+ del rep_a_fp4, rep_a_blockscale
+
+ # hidden size dimension is split to one half sized tensor.
+ intermediate = torch.empty(
+ (m_a * num_topk, w1_fp4.shape[1] // 2), device=device, dtype=out_dtype
+ )
+ silu_and_mul(c1, intermediate)
+
+ int_fp4, int_blockscale = scaled_fp4_experts_quant(
+ intermediate,
+ a2_gscale,
+ params.expert_offsets,
+ params.blockscale_offsets,
+ num_topk,
+ )
+ c2 = cutlass_fp4_group_mm(
+ int_fp4,
+ w2_fp4,
+ int_blockscale,
+ w2_blockscale,
+ w2_alphas,
+ out_dtype,
+ device,
+ params.to_gemm2_args(),
+ )
+ del int_fp4, int_blockscale
+ c2 = shuffle_rows(c2, c_map, (m_a * num_topk, params.hidden_size))
+ c2 = c2.view(m_a, num_topk, params.hidden_size)
+ if not apply_router_weight_on_input:
+ c2 = c2 * topk_weights.view(m_a, num_topk, 1).to(out_dtype)
+ return c2.sum(dim=1).to(out_dtype)

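For reference, the fused epilogue introduced above (shuffle_rows to undo the expert-grouped row permutation, then apply_shuffle_mul_sum to weight and reduce over the top-k experts) computes the same result as the plain-PyTorch expression it replaces in cutlass_fused_experts_fp8. Below is a small eager reference of that epilogue, useful only as an illustration or for checking the fused kernels against eager PyTorch; it is not the sgl_kernel implementation.

# Eager-PyTorch equivalent of the fused un-permute + weighted-sum epilogue.
# Mirrors the expression removed in the diff above; illustrative reference only.
import torch


def moe_epilogue_reference(
    c2: torch.Tensor,            # [m * topk, k] rows in expert-grouped order
    c_map: torch.Tensor,         # [m * topk] gather indices back to token order
    topk_weights: torch.Tensor,  # [m, topk] router weights
) -> torch.Tensor:
    m, topk = topk_weights.shape
    k = c2.shape[1]
    gathered = c2[c_map].view(m, topk, k)        # undo the permutation
    weights = topk_weights.view(m, topk, 1).to(c2.dtype)
    return (gathered * weights).sum(dim=1)       # reduce over the top-k experts
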
@@ -0,0 +1,169 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum, auto
3
+ from typing import Optional
4
+
5
+ import torch
6
+
7
+
8
+ class CutlassMoEType(Enum):
9
+ """
10
+ Enum for the different types of cutlass moe operations
11
+ that are currently supported in SGLang.
12
+ """
13
+
14
+ BlockscaledFP8 = auto()
15
+ BlockscaledFP4 = auto()
16
+
17
+
18
+ @dataclass
19
+ class CutlassMoEParams:
20
+ """
21
+ Parameters for the cutlass moe operation.
22
+ """
23
+
24
+ # Type as defined above
25
+ cutlass_moe_type: CutlassMoEType
26
+
27
+ # Strides for activations, weights and output in logical number of elements.
28
+ # The activations & output stride is the number of elements to the next row.
29
+ # The weights stride is the number of elements to the next row per expert.
30
+ # For example, if the weight is [e, n, k], then the b_stride is a tensor of
31
+ # shape [e] with each element being k. Similarly for activations, if the
32
+ # shape is [m, k], then the a_stride has shape [e] with each value k.
33
+ # Similarly for output, if the output is [m, n], then the c_stride is a
34
+ # tensor of shape [e] with each element being k.
35
+
+     # Note: cutlass_fp4_group_mm is designed to accept the strides of
+     # activations and weights to be the same, so it is passed in as a single
+     # tensor.
+     # ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+     # ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+     # c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+     # c_strides_2: [e] dtype: int64 [Gemm 2: Output Strides]
+     ab_strides_13: torch.Tensor
+     ab_strides_2: torch.Tensor
+     c_strides_13: torch.Tensor
+     c_strides_2: torch.Tensor
+
+     # m: Total number of tokens
+     # n: intermediate size per partition
+     # k: hidden size per expert
+     # e: Number of experts
+     # device: Device to run computation on and store tensors
+     m: int
+     intermediate_size_per_partition: int
+     hidden_size: int
+     num_experts: int
+     device: torch.device
+
+     # Pointers container for calculating offsets of the input activations for each expert
+     # a_ptrs: [e] dtype: int64
+     a_ptrs: torch.Tensor
+
+     # Pointers container for calculating offsets of the input weights for each expert
+     # b_ptrs: [e] dtype: int64
+     b_ptrs: torch.Tensor
+
+     # Pointers container for calculating offsets of the output activations for each expert
+     # out_ptrs: [e] dtype: int64
+     out_ptrs: torch.Tensor
+     # Pointers container for calculating offsets of the input scales for each expert
+     # a_scales_ptrs: [e] dtype: int64
+     # b_scales_ptrs: [e] dtype: int64
+     a_scales_ptrs: torch.Tensor
+     b_scales_ptrs: torch.Tensor
+
+     # Offsets that mark at which token index each expert begins its computation
+     # The number of tokens computed with expert E is expert_offsets[E + 1] - expert_offsets[E]
+     # expert_offsets: [e+1] dtype: int32
+     expert_offsets: torch.Tensor
+
+     # Problem size: (num_experts, (m,2n,k)) for first GEMM
+     # problem_sizes1: [e, 3] dtype: int32
+     # Problem size: (num_experts, (m,n,k)) for second GEMM
+     # problem_sizes2: [e, 3] dtype: int32
+     problem_sizes1: torch.Tensor
+     problem_sizes2: torch.Tensor
+     # Similar to expert_offsets, but for blockscales for FP4 blockscaled Group GEMM
+     blockscale_offsets: Optional[torch.Tensor] = None
+
+     def __init__(
+         self,
+         cutlass_moe_type: CutlassMoEType,
+         device: torch.device,
+         num_experts: int,
+         intermediate_size_per_partition: int,
+         hidden_size: int,
+     ):
+         self.cutlass_moe_type = cutlass_moe_type
+         self.device = device
+         self.num_experts = num_experts
+         self.intermediate_size_per_partition = intermediate_size_per_partition
+         self.hidden_size = hidden_size
+         self.n = self.intermediate_size_per_partition
+         self.k = self.hidden_size
+         self.e = self.num_experts
+         self.ab_strides_13 = torch.full(
+             (self.e,), self.k, dtype=torch.int64, device=self.device
+         )
+         self.ab_strides_2 = torch.full(
+             (self.e,), self.n, dtype=torch.int64, device=self.device
+         )
+         self.c_strides_13 = torch.full(
+             (self.e,), 2 * self.n, dtype=torch.int64, device=self.device
+         )
+         self.c_strides_2 = torch.full(
+             (self.e,), self.k, dtype=torch.int64, device=self.device
+         )
+         self.expert_offsets = torch.empty(
+             (self.e + 1,), dtype=torch.int32, device=self.device
+         )
+         self.problem_sizes1 = torch.empty(
+             (self.e, 3), dtype=torch.int32, device=self.device
+         )
+         self.problem_sizes2 = torch.empty(
+             (self.e, 3), dtype=torch.int32, device=self.device
+         )
+         if self.cutlass_moe_type == CutlassMoEType.BlockscaledFP4:
+             self.blockscale_offsets = torch.empty(
+                 (self.e + 1,), dtype=torch.int32, device=self.device
+             )
+         else:
+             self.blockscale_offsets = None
+         self.a_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+         self.b_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+         self.out_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+         self.a_scales_ptrs = torch.empty(
+             (self.e,), dtype=torch.int64, device=self.device
+         )
+         self.b_scales_ptrs = torch.empty(
+             (self.e,), dtype=torch.int64, device=self.device
+         )
+
+     def to_gemm1_args(self) -> dict:
+         return {
+             "ab_strides": self.ab_strides_13,
+             "c_strides": self.c_strides_13,
+             "problem_sizes": self.problem_sizes1,
+             "expert_offsets": self.expert_offsets[:-1],
+             "blockscale_offsets": self.blockscale_offsets[:-1],
+             # "a_ptrs": self.a_ptrs,
+             # "b_ptrs": self.b_ptrs,
+             # "out_ptrs": self.out_ptrs,
+             # "a_scales_ptrs": self.a_scales_ptrs,
+             # "b_scales_ptrs": self.b_scales_ptrs,
+         }
+
+     def to_gemm2_args(self) -> dict:
+         return {
+             "ab_strides": self.ab_strides_2,
+             "c_strides": self.c_strides_2,
+             "problem_sizes": self.problem_sizes2,
+             "expert_offsets": self.expert_offsets[:-1],
+             "blockscale_offsets": self.blockscale_offsets[:-1],
+             # "a_ptrs": self.a_ptrs,
+             # "b_ptrs": self.b_ptrs,
+             # "out_ptrs": self.out_ptrs,
+             # "a_scales_ptrs": self.a_scales_ptrs,
+             # "b_scales_ptrs": self.b_scales_ptrs,
+         }
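As a rough usage sketch of the container defined above: the parameters are built once per device/layer, and `to_gemm1_args()` / `to_gemm2_args()` hand the stride, offset and problem-size buffers to the two grouped GEMMs. The `cutlass_fp4_group_mm` call in the final comment is a placeholder for whichever grouped-GEMM entry point consumes these buffers; a real call site would also fill `expert_offsets` and `problem_sizes*` from the routing result first.

```python
import torch

# Assumes a CUDA device plus the CutlassMoEType / CutlassMoEParams definitions above.
params = CutlassMoEParams(
    cutlass_moe_type=CutlassMoEType.BlockscaledFP4,
    device=torch.device("cuda"),
    num_experts=8,
    intermediate_size_per_partition=2048,  # n
    hidden_size=4096,                      # k
)

# GEMM 1: [m, k] activations x [e, 2n, k] weights -> [m, 2n] (ab stride k, c stride 2n).
gemm1_kwargs = params.to_gemm1_args()
# GEMM 2: [m, n] activations x [e, k, n] weights -> [m, k] (ab stride n, c stride k).
gemm2_kwargs = params.to_gemm2_args()

# Illustrative only -- a real launch would look roughly like:
# c1 = cutlass_fp4_group_mm(a1_q, w1_q, a1_scale, w1_scale, **gemm1_kwargs)
```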
@@ -4,6 +4,7 @@ from typing import List, Optional
  import torch
  import triton
 
+ from sglang.math_utils import ceil_div
  from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
  from sglang.srt.utils import dispose_tensor, is_cuda
 
@@ -15,11 +16,6 @@ if _is_cuda:
          sglang_per_token_group_quant_fp8 as per_token_group_quant_fp8,
      )
 
-     try:
-         from deep_gemm import ceil_div
-     except ImportError:
-         logger.error(f"Failed to import ceil_div from deep_gemm.")
-
  import triton.language as tl
 
 
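With the optional `deep_gemm` import removed, `ceil_div` now comes from the new `sglang.math_utils` module imported in the previous hunk. The helper is presumably the usual integer ceiling division; a sketch of what such a function looks like (the actual body in `sglang.math_utils` may differ):

```python
def ceil_div(a: int, b: int) -> int:
    """Smallest integer >= a / b, for positive b."""
    return (a + b - 1) // b


assert ceil_div(10, 4) == 3   # 10 / 4 = 2.5 -> 3
assert ceil_div(8, 4) == 2    # exact division is unchanged
```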
@@ -178,26 +174,33 @@ def pre_reorder_triton_kernel(
      topk,
      hidden_size,
      BLOCK_SIZE: tl.constexpr,
+     use_per_token_if_dynamic: tl.constexpr,
  ):
      OutDtype = gateup_input_ptr.dtype.element_ty
 
      src_idx = tl.program_id(0)
      src2dst_ptr = src2dst_ptr + src_idx * topk
      topk_ids_ptr = topk_ids_ptr + src_idx * topk
-
      src_ptr = input_ptr + src_idx * hidden_size
+
+     vec = tl.arange(0, BLOCK_SIZE)
+
+     if a1_scales_ptr is not None and use_per_token_if_dynamic:
+         scale = 1.0 / tl.load(a1_scales_ptr + src_idx)
+
      for idx in range(topk):
          expert_id = tl.load(topk_ids_ptr + idx)
          if expert_id >= start_expert_id and expert_id <= end_expert_id:
              if a1_scales_ptr is not None:
-                 scale = 1.0 / tl.load(a1_scales_ptr + expert_id - start_expert_id)
+                 if not use_per_token_if_dynamic:
+                     scale = 1.0 / tl.load(a1_scales_ptr + expert_id - start_expert_id)
              else:
                  scale = 1.0
 
              dst_idx = tl.load(src2dst_ptr + idx)
              dst_ptr = gateup_input_ptr + dst_idx * hidden_size
              for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-                 offset = start_offset + tl.arange(0, BLOCK_SIZE)
+                 offset = start_offset + vec
                  mask = offset < hidden_size
                  in_data = tl.load(src_ptr + offset, mask=mask).to(tl.float32)
                  out_data = (in_data * scale).to(OutDtype)
@@ -271,6 +274,7 @@ def _silu_and_mul_post_quant_kernel(
      fp8_min,
      BLOCK_N: tl.constexpr,
      NUM_STAGE: tl.constexpr,
+     SCALE_UE8M0: tl.constexpr,
  ):
      expert_id = tl.program_id(2)
      token_id = tl.program_id(1)
@@ -312,6 +316,8 @@ def _silu_and_mul_post_quant_kernel(
          gate_up = up * gate
          _absmax = tl.maximum(tl.max(tl.abs(gate_up)), 1e-10)
          output_s = _absmax / fp8_max
+         if SCALE_UE8M0:
+             output_s = tl.exp2(tl.ceil(tl.log2(tl.abs(output_s))))
          output_q = tl.clamp(gate_up / output_s, fp8_min, fp8_max).to(
              output_ptr.dtype.element_ty
          )
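The new `SCALE_UE8M0` branch rounds each group's dequantization scale up to the next power of two before it is applied, so the scale can be stored as an exponent-only (UE8M0-style) value; because the scale only ever grows, `gate_up / output_s` stays within the FP8 range. A plain-Python illustration of the same rounding (values are arbitrary examples):

```python
import math

def ceil_to_pow2(s: float) -> float:
    """Mirrors tl.exp2(tl.ceil(tl.log2(abs(s)))): round |s| up to a power of two."""
    return 2.0 ** math.ceil(math.log2(abs(s)))

print(ceil_to_pow2(0.7))    # 1.0
print(ceil_to_pow2(1.0))    # 1.0
print(ceil_to_pow2(0.013))  # 0.015625 == 2**-6
```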
@@ -332,6 +338,7 @@ def silu_and_mul_masked_post_quant_fwd(
      output_scale: torch.Tensor,
      quant_group_size: int,
      masked_m: torch.Tensor,
+     scale_ue8m0: bool = False,
  ):
      """
      input shape [expert_num, token_num_padded, hidden_dim]
@@ -388,6 +395,7 @@ def silu_and_mul_masked_post_quant_fwd(
          BLOCK_N=BLOCK_N,
          NUM_STAGE=NUM_STAGES,
          num_warps=num_warps,
+         SCALE_UE8M0=scale_ue8m0,
      )
      return
 
@@ -481,8 +489,11 @@ def post_reorder_triton_kernel(
 
      computed = False
      store_ptr = output_ptr + src_idx * hidden_size
+
+     vec = tl.arange(0, BLOCK_SIZE)
+
      for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-         offset = start_offset + tl.arange(0, BLOCK_SIZE)
+         offset = start_offset + vec
          mask = offset < hidden_size
 
          sum_vec = tl.zeros([BLOCK_SIZE], dtype=InDtype)
@@ -499,7 +510,7 @@ def post_reorder_triton_kernel(
 
      if computed == False:
          for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-             offset = start_offset + tl.arange(0, BLOCK_SIZE)
+             offset = start_offset + vec
              mask = offset < hidden_size
              tl.store(
                  store_ptr + offset, tl.zeros([BLOCK_SIZE], dtype=InDtype), mask=mask
@@ -553,6 +564,7 @@ def grouped_gemm_triton_kernel(
      bs_stride_0: tl.constexpr,
      bs_stride_2: tl.constexpr,
      bs_stride_1: tl.constexpr,
+     use_per_token_if_dynamic: tl.constexpr,
      BLOCK_SIZE_M: tl.constexpr,
      BLOCK_SIZE_N: tl.constexpr,
      BLOCK_SIZE_K: tl.constexpr,
@@ -616,7 +628,10 @@ def grouped_gemm_triton_kernel(
          b_ptr += BLOCK_SIZE_K
 
      if use_fp8_w8a8 and not (group_k > 0 and group_n > 0):
-         scale_a_value = tl.load(scale_a + expert_id)
+         if use_per_token_if_dynamic:
+             scale_a_value = tl.load(scale_a + (m_range_start + offs_am[:, None]))
+         else:
+             scale_a_value = tl.load(scale_a + expert_id)
          scale_b_value = tl.load(scale_b + expert_id)
          accumulator *= scale_a_value * scale_b_value
 
@@ -653,6 +668,7 @@ def grouped_gemm_triton(
      scale_b: torch.Tensor = None,
      block_shape: Optional[List[int]] = None,
      c_dtype=None,
+     use_per_token_if_dynamic: bool = True,
  ):
      assert weight_column_major == True # TODO: more
      if use_fp8_w8a8 and block_shape is None:
@@ -693,6 +709,11 @@ def grouped_gemm_triton(
          triton.cdiv(b.size(1), META["BLOCK_SIZE_N"]),
      )
 
+     if use_fp8_w8a8 and block_shape is None and use_per_token_if_dynamic:
+         assert (
+             scale_a.shape[0] == a.shape[0]
+         ), f"scale_a.shape: {scale_a.shape}, a.shape: {a.shape}"
+
      grouped_gemm_triton_kernel[grid](
          a,
          b,
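Taken together, the host-side assert above and the kernel-side branch in `grouped_gemm_triton_kernel` define the per-token contract: with `use_per_token_if_dynamic`, the activation scale tensor holds one value per input row (`scale_a.shape[0] == a.shape[0]`) and is gathered per row of the tile, while the weight scale stays per expert. A hedged PyTorch sketch of the two dequantization modes on a single accumulator tile (all names and sizes here are illustrative):

```python
import torch

BLOCK_M, BLOCK_N, num_experts, num_rows = 4, 8, 16, 64
accumulator = torch.randn(BLOCK_M, BLOCK_N)        # raw GEMM result for one tile
expert_id = 3
m_range_start = 32                                 # first row of this expert's segment
offs_am = torch.arange(BLOCK_M)                    # row offsets within the tile

scale_b = torch.rand(num_experts)                  # per-expert weight scales
scale_a_per_expert = torch.rand(num_experts)       # old behaviour: one scale per expert
scale_a_per_token = torch.rand(num_rows)           # new behaviour: one scale per input row

use_per_token_if_dynamic = True
if use_per_token_if_dynamic:
    # Per-token: gather one scale per tile row and broadcast along N,
    # matching scale_a + (m_range_start + offs_am[:, None]) in the kernel.
    scale_a_value = scale_a_per_token[m_range_start + offs_am].unsqueeze(1)
else:
    # Per-expert: a single scalar for the whole tile.
    scale_a_value = scale_a_per_expert[expert_id]

dequant = accumulator * scale_a_value * scale_b[expert_id]   # shape [BLOCK_M, BLOCK_N]
```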
@@ -716,6 +737,7 @@ def grouped_gemm_triton(
          scale_b.stride(0) if scale_b is not None and scale_b.ndim >= 2 else 0,
          scale_b.stride(2) if scale_b is not None and scale_b.ndim == 3 else 0,
          scale_b.stride(1) if scale_b is not None and scale_b.ndim >= 2 else 0,
+         use_per_token_if_dynamic,
          **config,
      )
      return c