sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -11,27 +11,29 @@ from tqdm.contrib.concurrent import thread_map
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, get_device_sm, get_int_env_var, is_cuda
 
+logger = logging.getLogger(__name__)
 _ENABLE_JIT_DEEPGEMM = False
-if is_cuda():
+
+try:
     import deep_gemm
     from deep_gemm import get_num_sms
+    from deep_gemm.jit import build
     from deep_gemm.jit.compiler import get_nvcc_compiler
     from deep_gemm.jit_kernels.gemm import get_best_configs
     from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
-    from deep_gemm.jit_kernels.tuner import jit_tuner
 
     sm_version = get_device_sm()
     if sm_version == 90:
         if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"):
             _ENABLE_JIT_DEEPGEMM = True
+except ImportError:
+    logger.warning("Failed to import deepgemm, disable _ENABLE_JIT_DEEPGEMM.")
 
 
 def get_enable_jit_deepgemm():
     return _ENABLE_JIT_DEEPGEMM
 
 
-logger = logging.getLogger(__name__)
-
 _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
 _ENABLE_JIT_DEEPGEMM_PRECOMPILE = get_bool_env_var(
     "SGL_JIT_DEEPGEMM_PRECOMPILE", "true"
@@ -146,32 +148,28 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
     block_k = 128
     num_tma_threads = 128
     num_math_threads_per_group = 128
+
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedMasked,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-    _, _ = jit_tuner.compile_and_tune(
-        name="m_grouped_gemm_fp8_fp8_bf16_nt",
-        keys={
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedMasked,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
@@ -185,31 +183,26 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
     num_tma_threads = 128
     num_math_threads_per_group = 128
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedContiguous,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-    _, _ = jit_tuner.compile_and_tune(
-        name="m_grouped_gemm_fp8_fp8_bf16_nt",
-        keys={
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedContiguous,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 def _compile_gemm_nt_f8f8bf16_one(
@@ -226,28 +219,23 @@ def _compile_gemm_nt_f8f8bf16_one(
         "GEMM_TYPE": GemmType.Normal,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
         "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-    _, _ = jit_tuner.compile_and_tune(
-        name="gemm_fp8_fp8_bf16_nt",
-        keys={
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
@@ -389,3 +377,16 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     RuntimeCache.get = __patched_func
     yield
     RuntimeCache.get = origin_func
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)
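The final hunk above adds `configure_deep_gemm_num_sms`, a context manager that overrides DeepGEMM's SM count for the duration of a block and restores the previous value afterwards, even if the body raises. The same save/override/restore shape written as a generic, self-contained sketch (the `SmConfig` class and helper name here are illustrative, not sglang APIs):

```python
from contextlib import contextmanager


class SmConfig:
    # Toy stand-in for deep_gemm's get_num_sms()/set_num_sms() pair.
    def __init__(self, num_sms: int) -> None:
        self.num_sms = num_sms


@contextmanager
def configure_num_sms(cfg: SmConfig, num_sms):
    # None means "leave the setting alone"; otherwise override and restore on exit.
    if num_sms is None:
        yield
        return
    original = cfg.num_sms
    cfg.num_sms = num_sms
    try:
        yield
    finally:
        cfg.num_sms = original


cfg = SmConfig(num_sms=132)
with configure_num_sms(cfg, 64):
    print(cfg.num_sms)  # 64 while the block runs
print(cfg.num_sms)      # back to 132 afterwards
```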
@@ -49,8 +49,8 @@ from sglang.srt.layers.quantization.fp8_kernel import (
49
49
  )
50
50
  from sglang.srt.layers.quantization.fp8_utils import (
51
51
  apply_fp8_linear,
52
- apply_w8a8_block_fp8_linear,
53
52
  cutlass_fp8_supported,
53
+ dispatch_w8a8_block_fp8_linear,
54
54
  input_to_float8,
55
55
  normalize_e4m3fn_to_e4m3fnuz,
56
56
  )
@@ -62,6 +62,7 @@ from sglang.srt.layers.quantization.utils import (
62
62
  per_tensor_dequantize,
63
63
  requantize_with_max_scale,
64
64
  )
65
+ from sglang.srt.layers.utils import is_sm100_supported
65
66
  from sglang.srt.utils import (
66
67
  get_bool_env_var,
67
68
  is_cuda,
@@ -76,8 +77,8 @@ _is_cuda = is_cuda()
76
77
 
77
78
  _is_fp8_fnuz = is_fp8_fnuz()
78
79
 
79
- use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
80
- use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
80
+ _use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
81
+ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
81
82
 
82
83
  if _is_hip:
83
84
  from aiter import ActivationType, QuantType
@@ -208,6 +209,8 @@ class Fp8LinearMethod(LinearMethodBase):
208
209
  # Marlin doesn't support block-wise fp8
209
210
  self.use_marlin = False
210
211
 
212
+ self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
213
+
211
214
  def create_weights(
212
215
  self,
213
216
  layer: torch.nn.Module,
@@ -416,7 +419,7 @@ class Fp8LinearMethod(LinearMethodBase):
416
419
  )
417
420
 
418
421
  if self.block_quant:
419
- return apply_w8a8_block_fp8_linear(
422
+ return self.w8a8_block_fp8_linear(
420
423
  input=x,
421
424
  weight=layer.weight,
422
425
  block_size=self.quant_config.weight_block_size,
@@ -470,6 +473,7 @@ class Fp8MoEMethod:
470
473
  def __init__(self, quant_config):
471
474
  self.quant_config = quant_config
472
475
  self.block_quant = self.quant_config.weight_block_size is not None
476
+ self.cutlass_fp8_supported = cutlass_fp8_supported()
473
477
 
474
478
  def create_weights(
475
479
  self,
@@ -483,7 +487,7 @@ class Fp8MoEMethod:
483
487
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
484
488
 
485
489
  if self.quant_config.is_checkpoint_fp8_serialized:
486
- params_dtype = torch.uint32 if use_hip_int4 else torch.float8_e4m3fn
490
+ params_dtype = torch.uint32 if _use_hip_int4 else torch.float8_e4m3fn
487
491
  tp_size = get_tensor_model_parallel_world_size()
488
492
  if self.block_quant:
489
493
  block_n, block_k = (
@@ -508,7 +512,7 @@ class Fp8MoEMethod:
508
512
  )
509
513
 
510
514
  # WEIGHTS
511
- if _is_hip and use_hip_int4:
515
+ if _is_hip and _use_hip_int4:
512
516
  # INT4 MoE weight - INT32 packed
513
517
  w13_weight = torch.nn.Parameter(
514
518
  torch.empty(
@@ -568,6 +572,63 @@ class Fp8MoEMethod:
568
572
  layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
569
573
  layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
570
574
  assert self.quant_config.activation_scheme == "dynamic"
575
+ if (
576
+ get_bool_env_var("SGLANG_CUTLASS_MOE")
577
+ and self.cutlass_fp8_supported
578
+ and is_sm100_supported()
579
+ ):
580
+ self.ab_strides1 = torch.full(
581
+ (num_experts,),
582
+ hidden_size,
583
+ device=w13_weight.device,
584
+ dtype=torch.int64,
585
+ )
586
+ self.c_strides1 = torch.full(
587
+ (num_experts,),
588
+ 2 * intermediate_size,
589
+ device=w13_weight.device,
590
+ dtype=torch.int64,
591
+ )
592
+ self.ab_strides2 = torch.full(
593
+ (num_experts,),
594
+ intermediate_size,
595
+ device=w2_weight.device,
596
+ dtype=torch.int64,
597
+ )
598
+ self.c_strides2 = torch.full(
599
+ (num_experts,),
600
+ hidden_size,
601
+ device=w2_weight.device,
602
+ dtype=torch.int64,
603
+ )
604
+ self.workspace = torch.empty(
605
+ 90000, device=w13_weight.device, dtype=torch.uint8
606
+ )
607
+ self.a_ptr = torch.empty(
608
+ num_experts, device=w13_weight.device, dtype=torch.int64
609
+ )
610
+ self.b_ptr = torch.empty(
611
+ num_experts, device=w13_weight.device, dtype=torch.int64
612
+ )
613
+ self.out_ptr = torch.empty(
614
+ num_experts, device=w13_weight.device, dtype=torch.int64
615
+ )
616
+ self.a_scales_ptr = torch.empty(
617
+ num_experts, device=w13_weight.device, dtype=torch.int64
618
+ )
619
+ self.b_scales_ptr = torch.empty(
620
+ num_experts, device=w13_weight.device, dtype=torch.int64
621
+ )
622
+ self.expert_offsets = torch.empty(
623
+ num_experts + 1, device=w13_weight.device, dtype=torch.int32
624
+ )
625
+ self.problem_sizes1 = torch.empty(
626
+ num_experts, 3, device=w13_weight.device, dtype=torch.int32
627
+ )
628
+ self.problem_sizes2 = torch.empty(
629
+ num_experts, 3, device=w13_weight.device, dtype=torch.int32
630
+ )
631
+
571
632
  else:
572
633
  # Allocate 2 scales for w1 and w3 respectively.
573
634
  # They will be combined to a single scale after weight loading.
@@ -580,7 +641,7 @@ class Fp8MoEMethod:
580
641
  layer.register_parameter("w13_weight_scale", w13_weight_scale)
581
642
  layer.register_parameter("w2_weight_scale", w2_weight_scale)
582
643
 
583
- if _is_hip: # and use_aiter_moe: TODO: add check back after triton kernel
644
+ if _is_hip: # _use_aiter: TODO: add check back after triton kernel
584
645
  # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
585
646
  w13_weight_scale1 = torch.nn.Parameter(
586
647
  torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -607,7 +668,7 @@ class Fp8MoEMethod:
607
668
  set_weight_attrs(w13_weight_scale, extra_weight_attrs)
608
669
  set_weight_attrs(w2_weight_scale, extra_weight_attrs)
609
670
 
610
- if _is_hip and use_hip_int4:
671
+ if _is_hip and _use_hip_int4:
611
672
  extra_weight_attrs.update(
612
673
  {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
613
674
  )
@@ -639,7 +700,7 @@ class Fp8MoEMethod:
639
700
  layer.w2_input_scale = None
640
701
 
641
702
  def process_weights_after_loading(self, layer: Module) -> None:
642
- if _is_hip and use_hip_int4:
703
+ if _is_hip and _use_hip_int4:
643
704
  self.process_weights_hip_int4(layer)
644
705
  return
645
706
 
@@ -670,7 +731,7 @@ class Fp8MoEMethod:
670
731
  )
671
732
  layer.w2_input_scale = None
672
733
 
673
- if _is_hip and use_aiter_moe:
734
+ if _use_aiter:
674
735
  # Pre-shuffle weights
675
736
  layer.w13_weight.data = shuffle_weight(
676
737
  layer.w13_weight.contiguous(), (16, 16)
@@ -792,7 +853,7 @@ class Fp8MoEMethod:
792
853
  return
793
854
 
794
855
  def process_weights_hip_int4(self, layer: Module):
795
- # TODO: and use_aiter_moe: add after triton kernel added
856
+ # TODO: _use_aiter: add after triton kernel added
796
857
  # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
797
858
  # Weight Permutation
798
859
  layer.w13_weight = torch.nn.Parameter(
@@ -839,7 +900,7 @@ class Fp8MoEMethod:
839
900
  padding_size, # Avoid circular import
840
901
  )
841
902
 
842
- if use_aiter_moe:
903
+ if _use_aiter:
843
904
  layer.w13_weight = torch.nn.Parameter(
844
905
  shuffle_weight(layer.w13_weight.data, (16, 16)),
845
906
  requires_grad=False,
@@ -850,7 +911,7 @@ class Fp8MoEMethod:
850
911
  requires_grad=False,
851
912
  )
852
913
  torch.cuda.empty_cache()
853
- # ROCm (use_aiter_moe): using column-wise scaling
914
+ # ROCm (_use_aiter): using column-wise scaling
854
915
  layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
855
916
  layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
856
917
  elif get_bool_env_var("SGLANG_MOE_PADDING"):
@@ -876,6 +937,7 @@ class Fp8MoEMethod:
876
937
  use_grouped_topk: bool,
877
938
  topk_group: Optional[int] = None,
878
939
  num_expert_group: Optional[int] = None,
940
+ num_fused_shared_experts: int = 0,
879
941
  custom_routing_function: Optional[Callable] = None,
880
942
  correction_bias: Optional[torch.Tensor] = None,
881
943
  activation: str = "silu",
@@ -896,6 +958,7 @@ class Fp8MoEMethod:
896
958
  renormalize=renormalize,
897
959
  topk_group=topk_group,
898
960
  num_expert_group=num_expert_group,
961
+ num_fused_shared_experts=num_fused_shared_experts,
899
962
  custom_routing_function=custom_routing_function,
900
963
  correction_bias=correction_bias,
901
964
  routed_scaling_factor=routed_scaling_factor,
@@ -913,6 +976,37 @@ class Fp8MoEMethod:
  if ret is not None:
  return ret

+ if (
+ get_bool_env_var("SGLANG_CUTLASS_MOE")
+ and self.cutlass_fp8_supported
+ and self.block_quant
+ and is_sm100_supported()
+ ):
+ from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
+
+ return cutlass_fused_experts_fp8(
+ x,
+ layer.w13_weight.transpose(1, 2),
+ layer.w2_weight.transpose(1, 2),
+ layer.w13_weight_scale_inv.transpose(1, 2),
+ layer.w2_weight_scale_inv.transpose(1, 2),
+ topk_weights,
+ topk_ids,
+ self.ab_strides1,
+ self.c_strides1,
+ self.ab_strides2,
+ self.c_strides2,
+ self.workspace,
+ self.a_ptr,
+ self.b_ptr,
+ self.out_ptr,
+ self.a_scales_ptr,
+ self.b_scales_ptr,
+ self.expert_offsets,
+ self.problem_sizes1,
+ self.problem_sizes2,
+ use_fp8_blockscale=True,
+ )
  # Expert fusion with FP8 quantization
  return fused_experts(
  x,
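The new branch above adds a third dispatch tier to Fp8MoEMethod.apply: the ROCm fused kernels are tried first via maybe_apply_hip_fused_experts, then the CUTLASS FP8 block-scale path when SGLANG_CUTLASS_MOE is set on SM100-class hardware with block-quantized weights, and finally the default Triton fused_experts path. A minimal sketch of opting into the CUTLASS path, assuming the flag is read at apply time through get_bool_env_var as shown in the hunk:

# Hypothetical opt-in for the CUTLASS FP8 block-scale MoE path; it only takes
# effect when cutlass_fp8_supported, block_quant, and is_sm100_supported()
# also hold, per the gate in the hunk above.
import os

os.environ["SGLANG_CUTLASS_MOE"] = "1"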
@@ -936,6 +1030,7 @@ class Fp8MoEMethod:
  a2_scale=layer.w2_input_scale,
  block_shape=self.quant_config.weight_block_size,
  no_combine=no_combine,
+ routed_scaling_factor=routed_scaling_factor,
  )

  def maybe_apply_hip_fused_experts(
@@ -947,8 +1042,8 @@ class Fp8MoEMethod:
  activation: str = "silu",
  no_combine: bool = False,
  ) -> Optional[torch.Tensor]:
- if use_hip_int4:
- # TODO: add triton kernel and add check use_aiter_moe
+ if _use_hip_int4:
+ # TODO: add triton kernel and add check _use_aiter
  assert not no_combine, f"{no_combine=} is not supported."
  return ck_moe_2stages(
  x,
@@ -964,13 +1059,13 @@ class Fp8MoEMethod:
  ),
  )

- if use_aiter_moe:
+ if _use_aiter:
  assert not no_combine, f"{no_combine=} is not supported."
  if self.block_quant:
- # TODO(use_aiter_moe): FP8 block_quant only supports 'silu' for the time-being.
+ # TODO(_use_aiter): FP8 block_quant only supports 'silu' for the time-being.
  assert (
  activation == "silu"
- ), f"use_aiter_moe: FP8 bloack_quant {activation=} will be supported later, unset use_aiter_moe"
+ ), f"_use_aiter: FP8 bloack_quant {activation=} will be supported later, unset _use_aiter"
  return asm_moe(
  x,
  layer.w13_weight,
@@ -740,7 +740,59 @@ if _is_hip:
  return _w8a8_block_fp8_matmul


- def w8a8_block_fp8_matmul(
+ def prepare_block_fp8_matmul_inputs(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ As: torch.Tensor,
+ Bs: torch.Tensor,
+ block_size: List[int],
+ output_dtype: torch.dtype = torch.float16,
+ ) -> Tuple[int, int, int]:
+ assert len(block_size) == 2
+ block_n, block_k = block_size[0], block_size[1]
+
+ assert A.shape[-1] == B.shape[-1]
+ assert A.shape[:-1] == As.shape[:-1]
+ assert A.is_contiguous()
+ assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+
+ M = A.numel() // A.shape[-1]
+
+ assert B.ndim == 2
+ assert B.is_contiguous()
+ assert Bs.ndim == 2
+ N, K = B.shape
+ assert triton.cdiv(N, block_n) == Bs.shape[0]
+ assert triton.cdiv(K, block_k) == Bs.shape[1]
+
+ C_shape = A.shape[:-1] + (N,)
+ C = A.new_empty(C_shape, dtype=output_dtype)
+
+ return M, N, K, C
+
+
+ def w8a8_block_fp8_matmul_deepgemm(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ As: torch.Tensor,
+ Bs: torch.Tensor,
+ block_size: List[int],
+ output_dtype: torch.dtype,
+ ) -> torch.Tensor:
+ M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
+
+ # Deepgemm only supports output tensor type as bfloat16
+ assert C.dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM
+
+ if supports_custom_op():
+ torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
+ else:
+ deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+
+ return C
+
+
+ def w8a8_block_fp8_matmul_triton(
  A: torch.Tensor,
  B: torch.Tensor,
  As: torch.Tensor,
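prepare_block_fp8_matmul_inputs centralizes the shape checks that both matmul variants rely on: As holds one scale per block_k slice of each row of A, and Bs is a ceil(N/block_n) by ceil(K/block_k) grid of scales over B. A minimal sketch of shapes that satisfy those assertions, using float32 stand-ins (real callers pass FP8 A and B; the helper itself only checks shapes and allocates C):

# Illustrative shapes for block_size = [128, 128]; dtypes are stand-ins.
import torch


def cdiv(a: int, b: int) -> int:
    # Ceiling division, mirroring triton.cdiv in the assertions above.
    return -(-a // b)


M, N, K = 4, 512, 1024
block_n, block_k = 128, 128

A = torch.randn(M, K)                                 # activations, contiguous, (..., K)
B = torch.randn(N, K)                                 # weight in (N, K) layout
As = torch.rand(M, cdiv(K, block_k))                  # one scale per K-block of each row
Bs = torch.rand(cdiv(N, block_n), cdiv(K, block_k))   # one scale per (N, K) block

# These mirror the checks in prepare_block_fp8_matmul_inputs:
assert A.shape[-1] == B.shape[-1]
assert A.shape[:-1] == As.shape[:-1]
assert cdiv(A.shape[-1], block_k) == As.shape[-1]
assert cdiv(N, block_n) == Bs.shape[0] and cdiv(K, block_k) == Bs.shape[1]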
@@ -764,81 +816,81 @@ def w8a8_block_fp8_matmul(
  Returns:
  torch.Tensor: The result of matmul.
  """
- assert len(block_size) == 2
- block_n, block_k = block_size[0], block_size[1]
-
- assert A.shape[-1] == B.shape[-1]
- assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
- assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
- M = A.numel() // A.shape[-1]

- assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
- N, K = B.shape
- assert triton.cdiv(N, block_n) == Bs.shape[0]
- assert triton.cdiv(K, block_k) == Bs.shape[1]
+ M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)

- C_shape = A.shape[:-1] + (N,)
- C = A.new_empty(C_shape, dtype=output_dtype)
+ block_n, block_k = block_size

- # deepgemm only support bf16
- if C.dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM:
- if supports_custom_op():
- torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
- else:
- deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+ configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+ if configs:
+ # If an optimal configuration map has been found, look up the
+ # optimal config
+ config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
  else:
- configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
- if configs:
- # If an optimal configuration map has been found, look up the
- # optimal config
- config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
- else:
- # Default config
- # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
- config = {
- "BLOCK_SIZE_M": 64,
- "BLOCK_SIZE_N": block_size[0],
- "BLOCK_SIZE_K": block_size[1],
- "GROUP_SIZE_M": 32,
- "num_warps": 4,
- "num_stages": 3,
- }
-
- def grid(META):
- return (
- triton.cdiv(M, META["BLOCK_SIZE_M"])
- * triton.cdiv(N, META["BLOCK_SIZE_N"]),
- )
+ # Default config
+ # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
+ config = {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": block_size[0],
+ "BLOCK_SIZE_K": block_size[1],
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3,
+ }
+
+ def grid(META):
+ return (
+ triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+ )

- kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
+ kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)

- kernel[grid](
- A,
- B,
- C,
- As,
- Bs,
- M,
- N,
- K,
- block_n,
- block_k,
- A.stride(-2),
- A.stride(-1),
- B.stride(1),
- B.stride(0),
- C.stride(-2),
- C.stride(-1),
- As.stride(-2),
- As.stride(-1),
- Bs.stride(1),
- Bs.stride(0),
- **config,
- )
+ kernel[grid](
+ A,
+ B,
+ C,
+ As,
+ Bs,
+ M,
+ N,
+ K,
+ block_n,
+ block_k,
+ A.stride(-2),
+ A.stride(-1),
+ B.stride(1),
+ B.stride(0),
+ C.stride(-2),
+ C.stride(-1),
+ As.stride(-2),
+ As.stride(-1),
+ Bs.stride(1),
+ Bs.stride(0),
+ **config,
+ )

  return C


+ # universal entry point, for testing purposes
+ def w8a8_block_fp8_matmul(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ As: torch.Tensor,
+ Bs: torch.Tensor,
+ block_size: List[int],
+ output_dtype: torch.dtype = torch.float16,
+ ) -> torch.Tensor:
+ if output_dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM:
+ return w8a8_block_fp8_matmul_deepgemm(
+ A, B, As, Bs, block_size, output_dtype=output_dtype
+ )
+
+ return w8a8_block_fp8_matmul_triton(
+ A, B, As, Bs, block_size, output_dtype=output_dtype
+ )
+
+
  @triton.jit
  def _per_tensor_quant_mla_fp8_stage1(
  x_ptr,
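The new w8a8_block_fp8_matmul wrapper routes to the DeepGEMM variant only when the requested output dtype is bfloat16 and _ENABLE_JIT_DEEPGEMM is on, and to the Triton kernel otherwise. A hedged usage sketch: the module path in the import is an assumption (it follows sglang's fp8 kernel layout), and a CUDA device plus a torch build with float8_e4m3fn are required.

# Assumed import path; adjust to wherever w8a8_block_fp8_matmul is defined in
# this release. Requires a GPU and torch >= 2.1 for float8_e4m3fn.
import torch
from sglang.srt.layers.quantization.fp8_kernel import w8a8_block_fp8_matmul

M, N, K, blk = 8, 512, 1024, 128
A = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
B = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)
As = torch.rand(M, K // blk, device="cuda", dtype=torch.float32)
Bs = torch.rand(N // blk, K // blk, device="cuda", dtype=torch.float32)

# bfloat16 output may take the DeepGEMM path (when JIT DeepGEMM is enabled);
# float16 output always uses w8a8_block_fp8_matmul_triton.
C = w8a8_block_fp8_matmul(A, B, As, Bs, [blk, blk], output_dtype=torch.bfloat16)
print(C.shape)  # torch.Size([8, 512])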