sglang-0.4.6.post5-py3-none-any.whl → sglang-0.4.7-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
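
Because both wheels are public, a comparable file-level comparison can be reproduced locally with only the Python standard library. This is a hedged sketch, not part of the original diff; it assumes the wheels follow the standard sglang-<version>-py3-none-any.whl naming and have been downloaded to the working directory.

import zipfile

def wheel_members(path):
    # A wheel is a zip archive; namelist() returns every file inside it.
    with zipfile.ZipFile(path) as zf:
        return set(zf.namelist())

old = wheel_members("sglang-0.4.6.post5-py3-none-any.whl")
new = wheel_members("sglang-0.4.7-py3-none-any.whl")
print("added files:", len(new - old))
print("removed files:", len(old - new))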
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0

sglang/srt/layers/quantization/fp8.py +28 -23

@@ -49,10 +49,9 @@ from sglang.srt.layers.quantization.fp8_kernel import (
 )
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
-    apply_w8a8_block_fp8_linear,
     cutlass_fp8_supported,
+    dispatch_w8a8_block_fp8_linear,
     input_to_float8,
-    is_sm100_supported,
     normalize_e4m3fn_to_e4m3fnuz,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -63,6 +62,7 @@ from sglang.srt.layers.quantization.utils import (
     per_tensor_dequantize,
     requantize_with_max_scale,
 )
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
@@ -77,8 +77,8 @@ _is_cuda = is_cuda()
 
 _is_fp8_fnuz = is_fp8_fnuz()
 
-use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
-use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
+_use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _is_hip:
     from aiter import ActivationType, QuantType
@@ -209,6 +209,8 @@ class Fp8LinearMethod(LinearMethodBase):
             # Marlin doesn't support block-wise fp8
             self.use_marlin = False
 
+        self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -417,7 +419,7 @@ class Fp8LinearMethod(LinearMethodBase):
             )
 
         if self.block_quant:
-            return apply_w8a8_block_fp8_linear(
+            return self.w8a8_block_fp8_linear(
                 input=x,
                 weight=layer.weight,
                 block_size=self.quant_config.weight_block_size,
@@ -485,7 +487,7 @@ class Fp8MoEMethod:
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
         if self.quant_config.is_checkpoint_fp8_serialized:
-            params_dtype = torch.uint32 if use_hip_int4 else torch.float8_e4m3fn
+            params_dtype = torch.uint32 if _use_hip_int4 else torch.float8_e4m3fn
         tp_size = get_tensor_model_parallel_world_size()
         if self.block_quant:
             block_n, block_k = (
@@ -510,7 +512,7 @@ class Fp8MoEMethod:
                 )
 
         # WEIGHTS
-        if _is_hip and use_hip_int4:
+        if _is_hip and _use_hip_int4:
            # INT4 MoE weight - INT32 packed
            w13_weight = torch.nn.Parameter(
                torch.empty(
@@ -571,7 +573,7 @@ class Fp8MoEMethod:
             layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
             assert self.quant_config.activation_scheme == "dynamic"
             if (
-                get_bool_env_var("CUTLASS_MOE")
+                get_bool_env_var("SGLANG_CUTLASS_MOE")
                 and self.cutlass_fp8_supported
                 and is_sm100_supported()
             ):
@@ -639,7 +641,7 @@ class Fp8MoEMethod:
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
             layer.register_parameter("w2_weight_scale", w2_weight_scale)
 
-            if _is_hip:  # and use_aiter_moe: TODO: add check back after triton kernel
+            if _is_hip:  # _use_aiter: TODO: add check back after triton kernel
                 # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
                 w13_weight_scale1 = torch.nn.Parameter(
                     torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -666,7 +668,7 @@ class Fp8MoEMethod:
             set_weight_attrs(w13_weight_scale, extra_weight_attrs)
             set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
-        if _is_hip and use_hip_int4:
+        if _is_hip and _use_hip_int4:
             extra_weight_attrs.update(
                 {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
             )
@@ -698,7 +700,7 @@ class Fp8MoEMethod:
             layer.w2_input_scale = None
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        if _is_hip and use_hip_int4:
+        if _is_hip and _use_hip_int4:
             self.process_weights_hip_int4(layer)
             return
 
@@ -729,7 +731,7 @@ class Fp8MoEMethod:
                 )
                 layer.w2_input_scale = None
 
-            if _is_hip and use_aiter_moe:
+            if _use_aiter:
                 # Pre-shuffle weights
                 layer.w13_weight.data = shuffle_weight(
                     layer.w13_weight.contiguous(), (16, 16)
@@ -851,7 +853,7 @@ class Fp8MoEMethod:
             return
 
     def process_weights_hip_int4(self, layer: Module):
-        # TODO: and use_aiter_moe: add after triton kernel added
+        # TODO: _use_aiter: add after triton kernel added
         # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
         # Weight Permutation
         layer.w13_weight = torch.nn.Parameter(
@@ -898,7 +900,7 @@ class Fp8MoEMethod:
                 padding_size,  # Avoid circular import
             )
 
-            if use_aiter_moe:
+            if _use_aiter:
                 layer.w13_weight = torch.nn.Parameter(
                     shuffle_weight(layer.w13_weight.data, (16, 16)),
                     requires_grad=False,
@@ -909,7 +911,7 @@ class Fp8MoEMethod:
                     requires_grad=False,
                 )
             torch.cuda.empty_cache()
-            # ROCm (use_aiter_moe): using column-wise scaling
+            # ROCm (_use_aiter): using column-wise scaling
             layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
             layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
         elif get_bool_env_var("SGLANG_MOE_PADDING"):
@@ -935,6 +937,7 @@ class Fp8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -955,6 +958,7 @@ class Fp8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -973,14 +977,14 @@ class Fp8MoEMethod:
             return ret
 
         if (
-            get_bool_env_var("CUTLASS_MOE")
+            get_bool_env_var("SGLANG_CUTLASS_MOE")
             and self.cutlass_fp8_supported
             and self.block_quant
             and is_sm100_supported()
         ):
-            from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts
+            from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
 
-            return cutlass_fused_experts(
+            return cutlass_fused_experts_fp8(
                 x,
                 layer.w13_weight.transpose(1, 2),
                 layer.w2_weight.transpose(1, 2),
@@ -1026,6 +1030,7 @@ class Fp8MoEMethod:
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
 
     def maybe_apply_hip_fused_experts(
@@ -1037,8 +1042,8 @@ class Fp8MoEMethod:
         activation: str = "silu",
         no_combine: bool = False,
     ) -> Optional[torch.Tensor]:
-        if use_hip_int4:
-            # TODO: add triton kernel and add check use_aiter_moe
+        if _use_hip_int4:
+            # TODO: add triton kernel and add check _use_aiter
             assert not no_combine, f"{no_combine=} is not supported."
             return ck_moe_2stages(
                 x,
@@ -1054,13 +1059,13 @@ class Fp8MoEMethod:
                 ),
             )
 
-        if use_aiter_moe:
+        if _use_aiter:
             assert not no_combine, f"{no_combine=} is not supported."
             if self.block_quant:
-                # TODO(use_aiter_moe): FP8 block_quant only supports 'silu' for the time-being.
+                # TODO(_use_aiter): FP8 block_quant only supports 'silu' for the time-being.
                 assert (
                     activation == "silu"
-                ), f"use_aiter_moe: FP8 bloack_quant {activation=} will be supported later, unset use_aiter_moe"
+                ), f"_use_aiter: FP8 bloack_quant {activation=} will be supported later, unset _use_aiter"
             return asm_moe(
                 x,
                 layer.w13_weight,
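
The fp8.py hunks above consistently rename the hardware toggles to SGLANG_-prefixed environment variables (SGLANG_AITER_MOE becomes SGLANG_USE_AITER, CUTLASS_MOE becomes SGLANG_CUTLASS_MOE), and the AITER flag is now ANDed with _is_hip, so it only takes effect on ROCm builds. A hedged sketch of enabling the renamed flag when launching a 0.4.7 server follows; the model path is a placeholder, and the exact truthy parsing is left to sglang.srt.utils.get_bool_env_var.

import os
import subprocess

env = dict(os.environ)
# Replaces SGLANG_AITER_MOE from 0.4.6.post5; per the diff it is ignored on non-HIP builds.
env["SGLANG_USE_AITER"] = "1"
subprocess.run(
    ["python", "-m", "sglang.launch_server", "--model-path", "YOUR_MODEL"],  # placeholder model path
    env=env,
    check=True,
)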

sglang/srt/layers/quantization/fp8_kernel.py +118 -66

@@ -740,7 +740,59 @@ if _is_hip:
         return _w8a8_block_fp8_matmul
 
 
-def w8a8_block_fp8_matmul(
+def prepare_block_fp8_matmul_inputs(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> Tuple[int, int, int]:
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+    assert A.is_contiguous()
+    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2
+    assert B.is_contiguous()
+    assert Bs.ndim == 2
+    N, K = B.shape
+    assert triton.cdiv(N, block_n) == Bs.shape[0]
+    assert triton.cdiv(K, block_k) == Bs.shape[1]
+
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    return M, N, K, C
+
+
+def w8a8_block_fp8_matmul_deepgemm(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
+
+    # Deepgemm only supports output tensor type as bfloat16
+    assert C.dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM
+
+    if supports_custom_op():
+        torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
+    else:
+        deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+
+    return C
+
+
+def w8a8_block_fp8_matmul_triton(
     A: torch.Tensor,
     B: torch.Tensor,
     As: torch.Tensor,
@@ -764,81 +816,81 @@ def w8a8_block_fp8_matmul(
     Returns:
         torch.Tensor: The result of matmul.
     """
-    assert len(block_size) == 2
-    block_n, block_k = block_size[0], block_size[1]
-
-    assert A.shape[-1] == B.shape[-1]
-    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
-    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
-    M = A.numel() // A.shape[-1]
 
-    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
-    N, K = B.shape
-    assert triton.cdiv(N, block_n) == Bs.shape[0]
-    assert triton.cdiv(K, block_k) == Bs.shape[1]
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
 
-    C_shape = A.shape[:-1] + (N,)
-    C = A.new_empty(C_shape, dtype=output_dtype)
+    block_n, block_k = block_size
 
-    # deepgemm only support bf16
-    if C.dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM:
-        if supports_custom_op():
-            torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
-        else:
-            deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
     else:
-        configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
-        if configs:
-            # If an optimal configuration map has been found, look up the
-            # optimal config
-            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
-        else:
-            # Default config
-            # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
-            config = {
-                "BLOCK_SIZE_M": 64,
-                "BLOCK_SIZE_N": block_size[0],
-                "BLOCK_SIZE_K": block_size[1],
-                "GROUP_SIZE_M": 32,
-                "num_warps": 4,
-                "num_stages": 3,
-            }
-
-        def grid(META):
-            return (
-                triton.cdiv(M, META["BLOCK_SIZE_M"])
-                * triton.cdiv(N, META["BLOCK_SIZE_N"]),
-            )
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
+
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
 
-        kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
+    kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
 
-        kernel[grid](
-            A,
-            B,
-            C,
-            As,
-            Bs,
-            M,
-            N,
-            K,
-            block_n,
-            block_k,
-            A.stride(-2),
-            A.stride(-1),
-            B.stride(1),
-            B.stride(0),
-            C.stride(-2),
-            C.stride(-1),
-            As.stride(-2),
-            As.stride(-1),
-            Bs.stride(1),
-            Bs.stride(0),
-            **config,
-        )
+    kernel[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
 
     return C
 
 
+# universal entry point, for testing purposes
+def w8a8_block_fp8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    if output_dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM:
+        return w8a8_block_fp8_matmul_deepgemm(
+            A, B, As, Bs, block_size, output_dtype=output_dtype
+        )
+
+    return w8a8_block_fp8_matmul_triton(
+        A, B, As, Bs, block_size, output_dtype=output_dtype
+    )
+
+
 @triton.jit
 def _per_tensor_quant_mla_fp8_stage1(
     x_ptr,
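
In fp8_kernel.py the old w8a8_block_fp8_matmul is split into explicit _deepgemm and _triton variants, with a thin entry point kept "for testing purposes". A hedged usage sketch on a CUDA build of 0.4.7 is shown below; the sizes are arbitrary but chosen to satisfy the asserts in prepare_block_fp8_matmul_inputs, and the random scales are only there to make the call runnable.

import torch
from sglang.srt.layers.quantization.fp8_kernel import w8a8_block_fp8_matmul

M, N, K = 32, 512, 256          # N and K are multiples of the block sizes below
block_n, block_k = 128, 128

A = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)   # fp8 activations, shape [M, K]
B = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)   # fp8 weights, shape [N, K]
As = torch.rand(M, K // block_k, dtype=torch.float32, device="cuda")             # per-token, per-K-block scales
Bs = torch.rand(N // block_n, K // block_k, dtype=torch.float32, device="cuda")  # per-block weight scales

# A bfloat16 output may take the DeepGEMM path when _ENABLE_JIT_DEEPGEMM is on;
# any other dtype, or DeepGEMM disabled, falls through to the Triton kernel.
C = w8a8_block_fp8_matmul(A, B, As, Bs, [block_n, block_k], output_dtype=torch.bfloat16)
print(C.shape)  # torch.Size([32, 512])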

sglang/srt/layers/quantization/fp8_utils.py +165 -49

@@ -1,9 +1,11 @@
 import os
-from typing import List, Optional, Tuple
+from curses import flash
+from typing import Callable, List, Optional, Tuple
 
 import torch
 
 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
+from sglang.srt.layers.utils import is_sm100_supported
 
 try:
     from vllm import _custom_ops as ops
@@ -21,13 +23,15 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     scaled_fp8_quant,
     sglang_per_token_quant_fp8,
     static_quant_fp8,
-    w8a8_block_fp8_matmul,
+    w8a8_block_fp8_matmul_deepgemm,
+    w8a8_block_fp8_matmul_triton,
 )
 from sglang.srt.utils import (
     get_bool_env_var,
     get_cuda_version,
     get_device_capability,
     is_cuda,
+    is_flashinfer_available,
     is_hip,
 )
 
@@ -35,10 +39,10 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 _is_fp8_fnuz = is_fp8_fnuz()
 
-use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if _is_hip and use_aiter_moe:
-    from aiter import gemm_a8w8_blockscale
+if _use_aiter:
+    from aiter import gemm_a8w8_blockscale_CK
 
 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
@@ -80,12 +84,6 @@ def cutlass_fp8_supported():
     return False
 
 
-def is_sm100_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 10) and (
-        torch.version.cuda >= "12.8"
-    )
-
-
 def normalize_e4m3fn_to_e4m3fnuz(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
@@ -111,7 +109,7 @@ def normalize_e4m3fn_to_e4m3fnuz(
 
 
 def cutlass_block_fp8_supported() -> bool:
-    if not get_bool_env_var("SUPPORT_CUTLASS_BLOCK_FP8"):
+    if not get_bool_env_var("SGLANG_SUPPORT_CUTLASS_BLOCK_FP8"):
         return False
     if _is_cuda:
         major, minor = torch.cuda.get_device_capability()
@@ -123,9 +121,29 @@ def cutlass_block_fp8_supported() -> bool:
 
 
 CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
+ENABLE_FLASHINFER_GEMM = (
+    get_bool_env_var("SGLANG_ENABLE_FLASHINFER_GEMM")
+    and is_sm100_supported()
+    and is_flashinfer_available()
+)
+if ENABLE_FLASHINFER_GEMM:
+    from flashinfer.gemm import gemm_fp8_nt_groupwise
+
+
+def dispatch_w8a8_block_fp8_linear() -> Callable:
+    if ENABLE_FLASHINFER_GEMM:
+        return flashinfer_gemm_w8a8_block_fp8_linear
+    elif CUTLASS_BLOCK_FP8_SUPPORTED:
+        return cutlass_w8a8_block_fp8_linear_with_fallback
+    elif _use_aiter:
+        return aiter_w8a8_block_fp8_linear
+    elif _ENABLE_JIT_DEEPGEMM:
+        return deepgemm_w8a8_block_fp8_linear_with_fallback
+    else:
+        return triton_w8a8_block_fp8_linear
 
 
-def apply_w8a8_block_fp8_linear(
+def flashinfer_gemm_w8a8_block_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     block_size: List[int],
@@ -134,49 +152,147 @@ def apply_w8a8_block_fp8_linear(
     bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     assert input_scale is None
-    # View input as 2D matrix for fp8 methods
+
     input_2d = input.view(-1, input.shape[-1])
     output_shape = [*input.shape[:-1], weight.shape[0]]
-    # TODO: add more robust shape check here
-    shape_supported_by_cutlass = (
-        weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
     )
-    if CUTLASS_BLOCK_FP8_SUPPORTED and shape_supported_by_cutlass:
-        q_input, x_scale = per_token_group_quant_fp8(
-            input_2d, block_size[1], column_major_scales=True
-        )
-        output = fp8_blockwise_scaled_mm(
-            q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
-        )
-    elif _is_hip and use_aiter_moe:
-        q_input, x_scale = per_token_group_quant_fp8(
-            input_2d, block_size[1], column_major_scales=False
-        )
-        output = torch.zeros(
-            [q_input.shape[0], weight.shape[0]],
-            dtype=input.dtype,
-            device=q_input.device,
+
+    output = gemm_fp8_nt_groupwise(
+        q_input,
+        weight,
+        x_scale,
+        weight_scale,
+        scale_major_mode="K",
+        out_dtype=input_2d.dtype,
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def cutlass_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    # TODO: add more robust shape check here
+    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+
+    if not shape_supported:
+        # fallback to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
         )
-        gemm_a8w8_blockscale(q_input, weight, x_scale, weight_scale, output)
-    else:
-        if _ENABLE_JIT_DEEPGEMM:
-            q_input, x_scale = sglang_per_token_group_quant_fp8(
-                input_2d,
-                block_size[1],
-                column_major_scales=True,
-                scale_tma_aligned=True,
-            )
-        else:
-            q_input, x_scale = per_token_group_quant_fp8(
-                input_2d, block_size[1], column_major_scales=False
-            )
-        output = w8a8_block_fp8_matmul(
-            q_input, weight, x_scale, weight_scale, block_size, output_dtype=input.dtype
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=True
+    )
+    output = fp8_blockwise_scaled_mm(
+        q_input, weight.T, x_scale, weight_scale.T, out_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def deepgemm_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    output_dtype = input.dtype
+    dtype_supported = output_dtype == torch.bfloat16
+
+    # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
+
+    if not (shape_supported and dtype_supported):
+        # fall back to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
         )
 
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d,
+        block_size[1],
+        column_major_scales=True,
+        scale_tma_aligned=True,
+    )
+    output = w8a8_block_fp8_matmul_deepgemm(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=output_dtype
+    )
     if bias is not None:
-        output = output + bias
-    return output.to(dtype=input.dtype).view(*output_shape)
+        output += bias
+    return output.to(dtype=output_dtype).view(*output_shape)
+
+
+def aiter_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = gemm_a8w8_blockscale_CK(
+        q_input, weight, x_scale, weight_scale, dtype=input.dtype
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def triton_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = w8a8_block_fp8_matmul_triton(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
 
 
 def input_to_float8(
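
Taken together with the fp8.py hunks, fp8_utils.py now resolves a single block-FP8 linear backend up front (FlashInfer, CUTLASS, AITER, DeepGEMM, or Triton, in that order) instead of branching on every call: Fp8LinearMethod stores the result of dispatch_w8a8_block_fp8_linear() in __init__ and invokes it in apply(). Below is a hedged sketch of that resolve-once pattern using a hypothetical wrapper class rather than the real sglang layer; the keyword signature matches the backend functions shown in the diff.

from sglang.srt.layers.quantization.fp8_utils import dispatch_w8a8_block_fp8_linear

class BlockFp8Linear:  # hypothetical minimal wrapper, not an sglang class
    def __init__(self, weight, weight_scale, block_size):
        self.weight = weight
        self.weight_scale = weight_scale
        self.block_size = block_size
        # The backend is chosen once here, from env vars and hardware, not per forward pass.
        self._linear = dispatch_w8a8_block_fp8_linear()

    def forward(self, x, bias=None):
        # All dispatched backends share this keyword signature.
        return self._linear(
            input=x,
            weight=self.weight,
            block_size=self.block_size,
            weight_scale=self.weight_scale,
            input_scale=None,
            bias=bias,
        )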