sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8_kernel.py

@@ -23,7 +23,8 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
+from sglang.math_utils import align
+from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.utils import (
     direct_register_custom_op,
     get_device_core_count,
@@ -44,10 +45,6 @@ if _is_cuda:
         sgl_per_token_quant_fp8,
     )
 
-    from sglang.srt.layers.quantization.deep_gemm import (
-        gemm_nt_f8f8bf16 as deep_gemm_gemm_nt_f8f8bf16,
-    )
-
 logger = logging.getLogger(__name__)
 
 
@@ -67,7 +64,6 @@ else:
 fp8_max = torch.finfo(fp8_dtype).max
 fp8_min = -fp8_max
 
-
 if supports_custom_op():
 
     def deep_gemm_fp8_fp8_bf16_nt(
@@ -77,7 +73,7 @@ if supports_custom_op():
         Bs: torch.Tensor,
         C: torch.Tensor,
     ) -> None:
-        deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+        deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
 
     def deep_gemm_fp8_fp8_bf16_nt_fake(
         A: torch.Tensor,
@@ -280,6 +276,7 @@ def sglang_per_token_group_quant_fp8(
     eps: float = 1e-10,
     column_major_scales: bool = False,
     scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
 ):
     assert (
         x.shape[-1] % group_size == 0
@@ -287,8 +284,21 @@
     assert x.is_contiguous(), "`x` is not contiguous"
 
     x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
-    if column_major_scales:
+    if scale_ue8m0:
+        assert column_major_scales and scale_tma_aligned
+        x_q_mn, x_q_k = x.shape
+        x_s_mn, x_s_k = x_q_mn, x_q_k // 128
+        aligned_mn = align(x_s_mn, 4)
+        aligned_k = align(x_s_k, 4)
+        # TODO(FIXME): Fix cuda kernel and recover here to empty.
+        x_s = torch.zeros(
+            (aligned_k // 4, aligned_mn),
+            device=x.device,
+            dtype=torch.int,
+        ).transpose(0, 1)[:x_s_mn, :]
+    elif column_major_scales:
         if scale_tma_aligned:
+            # TODO extract "align" function
             # aligned to 4 * sizeof(float)
             aligned_size = (x.shape[-2] + 3) // 4 * 4
             x_s = torch.empty(
@@ -309,7 +319,9 @@
             dtype=torch.float32,
         )
     if x.shape[0] > 0:
-        sgl_per_token_group_quant_fp8(x, x_q, x_s, group_size, eps, fp8_min, fp8_max)
+        sgl_per_token_group_quant_fp8(
+            x, x_q, x_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0
+        )
 
     return x_q, x_s
 
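The scale_ue8m0 branch above sizes the scale tensor for DeepGEMM's packed UE8M0 (exponent-only) scales instead of plain float32 scales. A minimal sketch of that shape arithmetic follows; it assumes sglang.math_utils.align is a round-up-to-multiple helper (its implementation is not shown in this diff) and the ue8m0_scale_buffer name is hypothetical, introduced only for illustration.

import torch


def align(x: int, alignment: int) -> int:
    # Assumed behaviour of sglang.math_utils.align (not shown in this diff):
    # round x up to the next multiple of `alignment`.
    return (x + alignment - 1) // alignment * alignment


def ue8m0_scale_buffer(x: torch.Tensor, group_size: int = 128) -> torch.Tensor:
    # Hypothetical helper mirroring the scale_ue8m0 branch above: one UE8M0
    # scale per 128-wide group, four scales packed into each int32, and both
    # dimensions padded to multiples of 4 before transposing.
    mn, k = x.shape
    s_mn, s_k = mn, k // group_size
    packed = torch.zeros(
        (align(s_k, 4) // 4, align(s_mn, 4)), device=x.device, dtype=torch.int
    )
    # Column-major over mn, trimmed back to the unpadded row count, as in the diff.
    return packed.transpose(0, 1)[:s_mn, :]


x = torch.randn(10, 256)
print(ue8m0_scale_buffer(x).shape)  # torch.Size([10, 1]): 2 of the 4 packed slots used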
 
@@ -740,7 +752,76 @@ if _is_hip:
         return _w8a8_block_fp8_matmul
 
 
-def w8a8_block_fp8_matmul(
+def prepare_block_fp8_matmul_inputs(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> Tuple[int, int, int]:
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+    assert A.is_contiguous()
+
+    if As.dtype == torch.float:
+        assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    elif As.dtype == torch.int:
+        assert (
+            triton.cdiv(triton.cdiv(A.shape[-1], block_k), 4) == As.shape[-1]
+        ), f"{A.shape=} {As.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
+
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2
+    assert B.is_contiguous()
+    assert Bs.ndim == 2
+    N, K = B.shape
+
+    if Bs.dtype == torch.float:
+        assert triton.cdiv(N, block_n) == Bs.shape[0]
+        assert triton.cdiv(K, block_k) == Bs.shape[1]
+    elif Bs.dtype == torch.int:
+        assert N == Bs.shape[0], f"{B.shape=} {Bs.shape=} {block_size=}"
+        assert (
+            triton.cdiv(triton.cdiv(K, block_k), 4) == Bs.shape[1]
+        ), f"{B.shape=} {Bs.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
+
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    return M, N, K, C
+
+
+def w8a8_block_fp8_matmul_deepgemm(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
+
+    # Deepgemm only supports output tensor type as bfloat16
+    assert C.dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+
+    if supports_custom_op():
+        torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
+    else:
+        deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+
+    return C
+
+
+def w8a8_block_fp8_matmul_triton(
     A: torch.Tensor,
     B: torch.Tensor,
     As: torch.Tensor,
@@ -764,81 +845,81 @@ def w8a8_block_fp8_matmul(
     Returns:
         torch.Tensor: The result of matmul.
     """
-    assert len(block_size) == 2
-    block_n, block_k = block_size[0], block_size[1]
 
-    assert A.shape[-1] == B.shape[-1]
-    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
-    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
-    M = A.numel() // A.shape[-1]
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
 
-    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
-    N, K = B.shape
-    assert triton.cdiv(N, block_n) == Bs.shape[0]
-    assert triton.cdiv(K, block_k) == Bs.shape[1]
+    block_n, block_k = block_size
 
-    C_shape = A.shape[:-1] + (N,)
-    C = A.new_empty(C_shape, dtype=output_dtype)
-
-    # deepgemm only support bf16
-    if C.dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM:
-        if supports_custom_op():
-            torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
-        else:
-            deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
     else:
-        configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
-        if configs:
-            # If an optimal configuration map has been found, look up the
-            # optimal config
-            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
-        else:
-            # Default config
-            # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
-            config = {
-                "BLOCK_SIZE_M": 64,
-                "BLOCK_SIZE_N": block_size[0],
-                "BLOCK_SIZE_K": block_size[1],
-                "GROUP_SIZE_M": 32,
-                "num_warps": 4,
-                "num_stages": 3,
-            }
-
-        def grid(META):
-            return (
-                triton.cdiv(M, META["BLOCK_SIZE_M"])
-                * triton.cdiv(N, META["BLOCK_SIZE_N"]),
-            )
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
+
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
 
-        kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
+    kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
 
-        kernel[grid](
-            A,
-            B,
-            C,
-            As,
-            Bs,
-            M,
-            N,
-            K,
-            block_n,
-            block_k,
-            A.stride(-2),
-            A.stride(-1),
-            B.stride(1),
-            B.stride(0),
-            C.stride(-2),
-            C.stride(-1),
-            As.stride(-2),
-            As.stride(-1),
-            Bs.stride(1),
-            Bs.stride(0),
-            **config,
-        )
+    kernel[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
 
     return C
 
 
+# universal entry point, for testing purposes
+def w8a8_block_fp8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    if output_dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        return w8a8_block_fp8_matmul_deepgemm(
+            A, B, As, Bs, block_size, output_dtype=output_dtype
+        )
+
+    return w8a8_block_fp8_matmul_triton(
        A, B, As, Bs, block_size, output_dtype=output_dtype
+    )
+
+
 @triton.jit
 def _per_tensor_quant_mla_fp8_stage1(
     x_ptr,
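The new prepare_block_fp8_matmul_inputs centralizes the shape contract shared by the DeepGEMM and Triton paths, and the universal w8a8_block_fp8_matmul only picks between them. A small sketch of the expected layouts under the float32-scale branch; cdiv here is a local stand-in for triton.cdiv, and casting to torch.float8_e4m3fn assumes a PyTorch build with FP8 dtypes.

import torch


def cdiv(a: int, b: int) -> int:
    # Same rounding-up division as triton.cdiv, kept local so the sketch
    # does not need Triton installed.
    return (a + b - 1) // b


M, K, N = 16, 512, 768
block_n, block_k = 128, 128

# Activations are row-major with K last; weights are (N, K); float32 scales
# carry one value per K-group for A and one per (block_n, block_k) tile for B.
A = torch.randn(M, K).to(torch.float8_e4m3fn)
B = torch.randn(N, K).to(torch.float8_e4m3fn)
As = torch.rand(M, cdiv(K, block_k))
Bs = torch.rand(cdiv(N, block_n), cdiv(K, block_k))

# These mirror the float-scale asserts in prepare_block_fp8_matmul_inputs.
assert A.shape[-1] == B.shape[-1]
assert As.shape[-1] == cdiv(A.shape[-1], block_k)
assert Bs.shape == (cdiv(N, block_n), cdiv(K, block_k))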
sglang/srt/layers/quantization/fp8_utils.py

@@ -1,9 +1,12 @@
-import os
-from typing import List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple
 
+import einops
 import torch
 
+from sglang.math_utils import align
+from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
+from sglang.srt.layers.utils import is_sm100_supported
 
 try:
     from vllm import _custom_ops as ops
@@ -12,7 +15,6 @@ try:
 except ImportError:
     VLLM_AVAILABLE = False
 
-from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
 from sglang.srt.layers.quantization.fp8_kernel import (
     fp8_dtype,
     fp8_max,
@@ -21,13 +23,15 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     scaled_fp8_quant,
     sglang_per_token_quant_fp8,
     static_quant_fp8,
-    w8a8_block_fp8_matmul,
+    w8a8_block_fp8_matmul_deepgemm,
+    w8a8_block_fp8_matmul_triton,
 )
 from sglang.srt.utils import (
     get_bool_env_var,
     get_cuda_version,
     get_device_capability,
     is_cuda,
+    is_flashinfer_available,
     is_hip,
 )
 
@@ -35,10 +39,10 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 _is_fp8_fnuz = is_fp8_fnuz()
 
-use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if _is_hip and use_aiter_moe:
-    from aiter import gemm_a8w8_blockscale
+if _use_aiter:
+    from aiter import gemm_a8w8_blockscale_CK
 
 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
@@ -80,12 +84,6 @@ def cutlass_fp8_supported():
     return False
 
 
-def is_sm100_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 10) and (
-        torch.version.cuda >= "12.8"
-    )
-
-
 def normalize_e4m3fn_to_e4m3fnuz(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
@@ -111,7 +109,7 @@ def normalize_e4m3fn_to_e4m3fnuz(
 
 
 def cutlass_block_fp8_supported() -> bool:
-    if not get_bool_env_var("SUPPORT_CUTLASS_BLOCK_FP8"):
+    if not get_bool_env_var("SGLANG_SUPPORT_CUTLASS_BLOCK_FP8"):
         return False
     if _is_cuda:
         major, minor = torch.cuda.get_device_capability()
@@ -123,9 +121,29 @@ def cutlass_block_fp8_supported() -> bool:
 
 
 CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
+ENABLE_FLASHINFER_GEMM = (
+    get_bool_env_var("SGLANG_ENABLE_FLASHINFER_GEMM")
+    and is_sm100_supported()
+    and is_flashinfer_available()
+)
+if ENABLE_FLASHINFER_GEMM:
+    from flashinfer.gemm import gemm_fp8_nt_groupwise
+
+
+def dispatch_w8a8_block_fp8_linear() -> Callable:
+    if ENABLE_FLASHINFER_GEMM:
+        return flashinfer_gemm_w8a8_block_fp8_linear
+    elif CUTLASS_BLOCK_FP8_SUPPORTED:
+        return cutlass_w8a8_block_fp8_linear_with_fallback
+    elif _use_aiter:
+        return aiter_w8a8_block_fp8_linear
+    elif deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        return deepgemm_w8a8_block_fp8_linear_with_fallback
+    else:
+        return triton_w8a8_block_fp8_linear
 
 
-def apply_w8a8_block_fp8_linear(
+def flashinfer_gemm_w8a8_block_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     block_size: List[int],
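All five callables returned by dispatch_w8a8_block_fp8_linear share one signature, so a quantization method can resolve the backend once at load time instead of branching on every forward pass. A sketch of that pattern; BlockFp8Linear is an illustrative name introduced here, not the actual wiring used in fp8.py.

from typing import Callable, List, Optional

import torch


class BlockFp8Linear:
    # Illustrative wrapper: pick the w8a8 block-FP8 backend once at
    # construction, then route every forward call to the chosen kernel.
    def __init__(self, dispatch: Callable[[], Callable], block_size: List[int]):
        self._fn = dispatch()          # e.g. dispatch_w8a8_block_fp8_linear()
        self._block_size = block_size  # e.g. [128, 128]

    def forward(
        self,
        x: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Every backend accepts
        # (input, weight, block_size, weight_scale, input_scale, bias).
        return self._fn(
            x, weight, self._block_size, weight_scale, input_scale=None, bias=bias
        )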
@@ -134,49 +152,159 @@ def apply_w8a8_block_fp8_linear(
134
152
  bias: Optional[torch.Tensor] = None,
135
153
  ) -> torch.Tensor:
136
154
  assert input_scale is None
137
- # View input as 2D matrix for fp8 methods
155
+
138
156
  input_2d = input.view(-1, input.shape[-1])
139
157
  output_shape = [*input.shape[:-1], weight.shape[0]]
140
- # TODO: add more robust shape check here
141
- shape_supported_by_cutlass = (
142
- weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
158
+
159
+ q_input, x_scale = sglang_per_token_group_quant_fp8(
160
+ input_2d, block_size[1], column_major_scales=False
143
161
  )
144
- if CUTLASS_BLOCK_FP8_SUPPORTED and shape_supported_by_cutlass:
145
- q_input, x_scale = per_token_group_quant_fp8(
146
- input_2d, block_size[1], column_major_scales=True
147
- )
148
- output = fp8_blockwise_scaled_mm(
149
- q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
150
- )
151
- elif _is_hip and use_aiter_moe:
152
- q_input, x_scale = per_token_group_quant_fp8(
153
- input_2d, block_size[1], column_major_scales=False
154
- )
155
- output = torch.zeros(
156
- [q_input.shape[0], weight.shape[0]],
157
- dtype=input.dtype,
158
- device=q_input.device,
162
+
163
+ output = gemm_fp8_nt_groupwise(
164
+ q_input,
165
+ weight,
166
+ x_scale,
167
+ weight_scale,
168
+ scale_major_mode="K",
169
+ out_dtype=input_2d.dtype,
170
+ )
171
+
172
+ if bias is not None:
173
+ output += bias
174
+
175
+ return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def cutlass_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    # TODO: add more robust shape check here
+    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+
+    if not shape_supported:
+        # fallback to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
         )
-        gemm_a8w8_blockscale(q_input, weight, x_scale, weight_scale, output)
-    else:
-        if _ENABLE_JIT_DEEPGEMM:
-            q_input, x_scale = sglang_per_token_group_quant_fp8(
-                input_2d,
-                block_size[1],
-                column_major_scales=True,
-                scale_tma_aligned=True,
-            )
-        else:
-            q_input, x_scale = per_token_group_quant_fp8(
-                input_2d, block_size[1], column_major_scales=False
-            )
-        output = w8a8_block_fp8_matmul(
-            q_input, weight, x_scale, weight_scale, block_size, output_dtype=input.dtype
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=True
+    )
+    output = fp8_blockwise_scaled_mm(
+        q_input, weight.T, x_scale, weight_scale.T, out_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
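The CUTLASS path is only taken when both weight dimensions are multiples of the 128x128 scale block; anything else silently falls back to the Triton kernel. A tiny predicate mirroring that gate (the helper name is illustrative, not part of the diff):

```python
# Sketch of the shape gate used by cutlass_w8a8_block_fp8_linear_with_fallback.
def takes_cutlass_path(n: int, k: int, cutlass_supported: bool = True) -> bool:
    return cutlass_supported and n % 128 == 0 and k % 128 == 0

print(takes_cutlass_path(4096, 14336))       # True
print(takes_cutlass_path(4096, 14336 + 64))  # False -> triton fallback
```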
+
+
+def deepgemm_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    output_dtype = input.dtype
+    dtype_supported = output_dtype == torch.bfloat16
+
+    # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
+
+    if not (shape_supported and dtype_supported):
+        # fall back to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
         )
 
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d,
+        block_size[1],
+        column_major_scales=True,
+        scale_tma_aligned=True,
+        scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+    )
+
+    # NOTE(alcanderian): Useless when scale is packed to int32
+    # if get_bool_env_var("SGLANG_W8A8_DEEPGEMM_SANITY_CHECK_UE8M0"):
+    #     _check_ue8m0("x_scale", x_scale)
+    #     _check_ue8m0("weight_scale", ws)
+
+    output = w8a8_block_fp8_matmul_deepgemm(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=output_dtype
+    )
     if bias is not None:
-        output = output + bias
-    return output.to(dtype=input.dtype).view(*output_shape)
+        output += bias
+    return output.to(dtype=output_dtype).view(*output_shape)
+
+
+def _check_ue8m0(name, x):
+    x_ceil = ceil_to_ue8m0(x)
+    assert torch.all(x == x_ceil), f"{name=} {x=} {x_ceil=}"
+
+
+def aiter_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = gemm_a8w8_blockscale_CK(
+        q_input, weight, x_scale, weight_scale, dtype=input.dtype
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def triton_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = w8a8_block_fp8_matmul_triton(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
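All five backends split out above compute the same mathematical result; they differ only in kernel implementation, scale layout, and hardware requirements. A slow, self-contained reference for that computation, useful as a numerical cross-check; the function name and signature are illustrative, not an sglang API:

```python
# Reference (non-performant) semantics: y = dequant(activation) @ dequant(weight).T + bias,
# with 128x128 weight scale blocks and 1x128 per-token activation groups.
import torch

def w8a8_block_fp8_linear_ref(q_input, x_scale, q_weight, w_scale,
                              block_size=(128, 128), bias=None):
    block_n, block_k = block_size
    n, k = q_weight.shape
    # Expand per-block weight scales back to element granularity, then dequantize.
    w_s = w_scale.repeat_interleave(block_n, dim=0).repeat_interleave(block_k, dim=1)
    w = q_weight.float() * w_s[:n, :k]
    # Expand per-(token, group) activation scales along K, then dequantize.
    x_s = x_scale.repeat_interleave(block_k, dim=1)[:, :k]
    x = q_input.float() * x_s
    y = x @ w.T
    if bias is not None:
        y = y + bias.float()
    return y
```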
 
 
 def input_to_float8(
@@ -253,27 +381,80 @@ def block_quant_dequant(
     The output is an unquantized tensor with dtype.
     """
     block_n, block_k = block_size[0], block_size[1]
-    n, k = x_q_block.shape
-    n_tiles = (n + block_n - 1) // block_n
-    k_tiles = (k + block_k - 1) // block_k
-    assert n_tiles == x_s.shape[0]
-    assert k_tiles == x_s.shape[1]
+    *_, n, k = x_q_block.shape
 
-    x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+    # ... n_scale k_scale -> ... (n_scale block_n) (k_scale block_k)
+    x_scale_repeat = x_s.repeat_interleave(block_n, dim=-2).repeat_interleave(
+        block_k, dim=-1
+    )
+    x_scale_repeat = x_scale_repeat[..., :n, :k]
+
+    return (x_q_block.to(torch.float32) * x_scale_repeat).to(dtype)
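The rewrite above replaces the per-tile Python loop with two `repeat_interleave` calls and a crop. A quick self-contained check that the two formulations agree (shapes chosen for illustration; a float tensor stands in for the FP8 input):

```python
import torch

n, k, block_n, block_k = 192, 320, 128, 128
x_q = torch.randn(n, k)
x_s = torch.rand((n + block_n - 1) // block_n, (k + block_k - 1) // block_k)

# New path: broadcast block scales by repetition, then crop to (n, k).
scale_rep = x_s.repeat_interleave(block_n, dim=-2).repeat_interleave(block_k, dim=-1)
new = x_q.float() * scale_rep[:n, :k]

# Old path: explicit loop over (block_n, block_k) tiles.
old = torch.empty_like(new)
for j in range((n + block_n - 1) // block_n):
    for i in range((k + block_k - 1) // block_k):
        rows = slice(j * block_n, min((j + 1) * block_n, n))
        cols = slice(i * block_k, min((i + 1) * block_k, k))
        old[rows, cols] = x_q[rows, cols].float() * x_s[j, i]

assert torch.allclose(new, old)
```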
393
+
394
+
395
+ def requant_weight_ue8m0_inplace(weight, weight_scale_inv, weight_block_size):
396
+ assert isinstance(weight, torch.nn.Parameter)
397
+ assert isinstance(weight_scale_inv, torch.nn.Parameter)
398
+ weight.data, weight_scale_inv.data = _requant_weight_ue8m0(
399
+ weight, weight_scale_inv, weight_block_size
400
+ )
401
+
402
+
403
+ def _requant_weight_ue8m0(
404
+ weight: torch.Tensor,
405
+ weight_scale_inv: torch.Tensor,
406
+ weight_block_size: List[int],
407
+ ):
408
+ assert weight_block_size == [128, 128]
409
+
410
+ *_, n, k = weight.shape
411
+
412
+ weight_dequant = block_quant_dequant(
413
+ weight,
414
+ weight_scale_inv,
415
+ weight_block_size,
416
+ torch.bfloat16,
417
+ )
418
+
419
+ weight_dequant_flat = weight_dequant.view((-1, k))
420
+ out_w_flat, out_s_flat = per_block_cast_to_fp8(weight_dequant_flat)
421
+
422
+ out_w = out_w_flat.view(weight.shape)
423
+ out_s = out_s_flat.view(weight_scale_inv.shape)
424
+
425
+ # NOTE copy and modified from DeepGEMM
426
+ def _transform_scale(sf, mn: int):
427
+ import deep_gemm.utils.layout
428
+
429
+ sf = sf.index_select(-2, torch.arange(mn, device=sf.device) // 128)
430
+ sf = deep_gemm.utils.layout.get_col_major_tma_aligned_packed_tensor(sf)
431
+ return sf
432
+
433
+ out_s = _transform_scale(out_s, mn=out_w.shape[-2])
434
+
435
+ return out_w, out_s
436
+
437
+
438
+ # COPIED FROM DeepGEMM
439
+ def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
440
+ assert x.dim() == 2
441
+ m, n = x.shape
442
+ x_padded = torch.zeros(
443
+ (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
444
+ )
445
+ x_padded[:m, :n] = x
446
+ x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
447
+ x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
448
+ sf = ceil_to_ue8m0(x_amax / 448.0)
449
+ x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
450
+ return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
451
+ x_view.size(0), x_view.size(2)
452
+ )
 
-    for j in range(n_tiles):
-        for i in range(k_tiles):
-            x_q_block_tile = x_q_block[
-                j * block_n : min((j + 1) * block_n, n),
-                i * block_k : min((i + 1) * block_k, k),
-            ]
-            x_dq_block_tile = x_dq_block[
-                j * block_n : min((j + 1) * block_n, n),
-                i * block_k : min((i + 1) * block_k, k),
-            ]
-            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
 
-    return x_dq_block
+# COPIED FROM DeepGEMM
+def ceil_to_ue8m0(x: torch.Tensor):
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
 
 
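The `ceil_to_ue8m0` helper added above rounds each scale up to the nearest power of two, since a UE8M0 scale stores only an 8-bit exponent; rounding up rather than to nearest keeps the quantized values within the FP8 range. A small worked example (values chosen for illustration):

```python
import torch

def ceil_to_ue8m0(x: torch.Tensor):
    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))

# 0.3 -> 0.5, 0.25 -> 0.25 (already a power of two), 3.0 -> 4.0, 100.0 -> 128.0
print(ceil_to_ue8m0(torch.tensor([0.3, 0.25, 3.0, 100.0])))
```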
  def channel_quant_to_tensor_quant(