sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -5,33 +5,22 @@ from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple
 
-import torch
 from tqdm.contrib.concurrent import thread_map
 
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    ENABLE_JIT_DEEPGEMM,
+)
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_bool_env_var, get_device_sm, get_int_env_var, is_cuda
+from sglang.srt.utils import get_bool_env_var, get_int_env_var
 
 logger = logging.getLogger(__name__)
-_ENABLE_JIT_DEEPGEMM = False
 
-try:
-    import deep_gemm
+if ENABLE_JIT_DEEPGEMM and not DEEPGEMM_BLACKWELL:
     from deep_gemm import get_num_sms
-    from deep_gemm.jit.compiler import get_nvcc_compiler
+    from deep_gemm.jit import build
     from deep_gemm.jit_kernels.gemm import get_best_configs
     from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
-    from deep_gemm.jit_kernels.tuner import jit_tuner
-
-    sm_version = get_device_sm()
-    if sm_version == 90:
-        if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"):
-            _ENABLE_JIT_DEEPGEMM = True
-except ImportError:
-    logger.warning("Failed to import deepgemm, disable _ENABLE_JIT_DEEPGEMM.")
-
-
-def get_enable_jit_deepgemm():
-    return _ENABLE_JIT_DEEPGEMM
 
 
 _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
@@ -52,8 +41,10 @@ os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
 # NVRTC may have performance loss with some cases.
 # And NVCC JIT speed is also 9x faster in the ref commit
 _USE_NVRTC_DEFAULT = "0"
-if _ENABLE_JIT_DEEPGEMM:
+if ENABLE_JIT_DEEPGEMM:
     try:
+        from deep_gemm.jit.compiler import get_nvcc_compiler
+
         get_nvcc_compiler()
     except:
         logger.warning(
@@ -114,11 +105,12 @@ class DeepGemmKernelHelper:
 _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
 
 
+# TODO improve naming
 def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
             "Entering DeepGEMM JIT Pre-Compile session. "
-            "And it may takes a long time(Typically 10-20 mins) "
+            "It may takes a long time (typically 10-20 mins) "
             "if you have not run `sglang.compile_deep_gemm`. "
             "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
             " for pre-compilation to reduce the overhead if you have not run it before. "
@@ -127,6 +119,7 @@ def _compile_warning_1():
         )
 
 
+# TODO improve naming
 def _compile_warning_2():
     logger.warning(
         "Entering DeepGEMM JIT Single Kernel Compile session. "
@@ -148,32 +141,28 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
     block_k = 128
     num_tma_threads = 128
     num_math_threads_per_group = 128
+
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedMasked,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-    _, _ = jit_tuner.compile_and_tune(
-        name="m_grouped_gemm_fp8_fp8_bf16_nt",
-        keys={
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedMasked,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
@@ -187,31 +176,26 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
     num_tma_threads = 128
     num_math_threads_per_group = 128
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedContiguous,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-    _, _ = jit_tuner.compile_and_tune(
-        name="m_grouped_gemm_fp8_fp8_bf16_nt",
-        keys={
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedContiguous,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 def _compile_gemm_nt_f8f8bf16_one(
@@ -228,30 +212,26 @@ def _compile_gemm_nt_f8f8bf16_one(
         "GEMM_TYPE": GemmType.Normal,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
         "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-    _, _ = jit_tuner.compile_and_tune(
-        name="gemm_fp8_fp8_bf16_nt",
-        keys={
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
 
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
+
+# TODO further refactor warmup-related
 _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
     DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: DeepGemmKernelHelper(
         name="m_grouped_gemm_fp8_fp8_bf16_nt_masked",
@@ -284,7 +264,6 @@ def _maybe_compile_deep_gemm_one_type_all(
     num_groups: int,
     m_list: Optional[List[int]] = None,
 ) -> None:
-
     global _INITIALIZATION_DICT
     global _BUILTIN_M_LIST
 
@@ -318,56 +297,6 @@ def _maybe_compile_deep_gemm_one_type_all(
     thread_map(compile_func, collected_configs, max_workers=_COMPILE_WORKERS)
 
 
-def grouped_gemm_nt_f8f8bf16_masked(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-    masked_m: torch.Tensor,
-    expected_m: int,
-):
-    num_groups, _, k = lhs[0].shape
-    _, n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-
-    with _log_jit_build(expected_m, n, k, kernel_type):
-        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
-            lhs, rhs, out, masked_m, expected_m
-        )
-
-
-def grouped_gemm_nt_f8f8bf16_contig(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-    m_indices: torch.Tensor,
-):
-    m, k = lhs[0].shape
-    num_groups, n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-
-    with _log_jit_build(m, n, k, kernel_type):
-        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs, rhs, out, m_indices)
-
-
-def gemm_nt_f8f8bf16(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-):
-    m, k = lhs[0].shape
-    n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GEMM_NT_F8F8BF16
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, 1)
-
-    with _log_jit_build(m, n, k, kernel_type):
-        deep_gemm.gemm_fp8_fp8_bf16_nt(lhs, rhs, out)
-
-
 @contextmanager
 def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     if _IN_PRECOMPILE_STAGE:
@@ -382,7 +311,8 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
         ret = origin_func(self, *args, **kwargs)
         if ret is None:
             kernel_helper = _KERNEL_HELPER_DICT[kernel_type]
-            _compile_warning_2()
+            if not DEEPGEMM_BLACKWELL:
+                _compile_warning_2()
             logger.warning(
                 f"DeepGEMM JIT Compiling for <{kernel_helper.name}> M={M}, N={N}, K={K}. Please wait."
             )
@@ -391,3 +321,15 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     RuntimeCache.get = __patched_func
     yield
     RuntimeCache.get = origin_func
+
+
+@contextmanager
+def deep_gemm_execution_hook(
+    m: int, n: int, k: int, num_groups: int, kernel_type: DeepGemmKernelType
+):
+    # not supported yet
+    if not DEEPGEMM_BLACKWELL:
+        _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+
+    with _log_jit_build(m, n, k, kernel_type):
+        yield
@@ -0,0 +1,32 @@
+import logging
+
+from sglang.srt.utils import get_bool_env_var, get_device_sm
+
+logger = logging.getLogger(__name__)
+
+
+def _compute_enable_deep_gemm():
+    sm_version = get_device_sm()
+    if sm_version < 90:
+        return False
+
+    try:
+        import deep_gemm
+    except ImportError:
+        logger.warning("Failed to import deep_gemm, disable ENABLE_JIT_DEEPGEMM.")
+        return False
+
+    return get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true")
+
+
+ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
+
+try:
+    from deep_gemm import fp8_gemm_nt
+
+    # They have not given a name to this breaking change
+    DEEPGEMM_BLACKWELL = True
+except ImportError:
+    DEEPGEMM_BLACKWELL = False
+
+DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
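
The new configurer module above centralizes the DeepGEMM capability flags that the rest of the quantization code now imports instead of probing deep_gemm locally. A minimal consumption sketch, assuming sglang 0.4.7.post1 is installed; the describe_deep_gemm_support helper below is hypothetical and added only for illustration:

# Hypothetical sketch: reading the flags exported by
# sglang.srt.layers.quantization.deep_gemm_wrapper.configurer.
from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
    DEEPGEMM_BLACKWELL,
    DEEPGEMM_SCALE_UE8M0,
    ENABLE_JIT_DEEPGEMM,
)


def describe_deep_gemm_support() -> str:
    # ENABLE_JIT_DEEPGEMM is False when deep_gemm is not importable, the GPU
    # is below SM90, or SGL_ENABLE_JIT_DEEPGEMM is set to a false value.
    if not ENABLE_JIT_DEEPGEMM:
        return "DeepGEMM JIT disabled"
    api = "Blackwell-era" if DEEPGEMM_BLACKWELL else "Hopper-era"
    return f"DeepGEMM JIT enabled, {api} API, UE8M0 scales={DEEPGEMM_SCALE_UE8M0}"


print(describe_deep_gemm_support())
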
@@ -0,0 +1,110 @@
+import logging
+from contextlib import contextmanager
+from typing import Tuple
+
+import torch
+
+from sglang.srt.layers.quantization.deep_gemm_wrapper import compile_utils
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    DEEPGEMM_SCALE_UE8M0,
+    ENABLE_JIT_DEEPGEMM,
+)
+from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+
+    if DEEPGEMM_BLACKWELL:
+        from deep_gemm import fp8_gemm_nt as _gemm_nt_f8f8bf16_raw
+        from deep_gemm import (
+            fp8_m_grouped_gemm_nt_masked as _grouped_gemm_nt_f8f8bf16_masked_raw,
+        )
+        from deep_gemm import (
+            m_grouped_fp8_gemm_nt_contiguous as _grouped_gemm_nt_f8f8bf16_contig_raw,
+        )
+    else:
+        from deep_gemm import gemm_fp8_fp8_bf16_nt as _gemm_nt_f8f8bf16_raw
+        from deep_gemm import get_col_major_tma_aligned_tensor
+        from deep_gemm import (
+            m_grouped_gemm_fp8_fp8_bf16_nt_contiguous as _grouped_gemm_nt_f8f8bf16_contig_raw,
+        )
+        from deep_gemm import (
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked as _grouped_gemm_nt_f8f8bf16_masked_raw,
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_masked(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m: int,
+    recipe=None,
+):
+    num_groups, _, k = lhs[0].shape
+    _, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
+
+    with compile_utils.deep_gemm_execution_hook(
+        expected_m, n, k, num_groups, kernel_type
+    ):
+        _grouped_gemm_nt_f8f8bf16_masked_raw(
+            lhs,
+            rhs,
+            out,
+            masked_m,
+            expected_m,
+            **({"recipe": recipe} if DEEPGEMM_BLACKWELL else {})
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_contig(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    m_indices: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    num_groups, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        _grouped_gemm_nt_f8f8bf16_contig_raw(lhs, rhs, out, m_indices)
+
+
+def gemm_nt_f8f8bf16(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    n, _ = rhs[0].shape
+    num_groups = 1
+    kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_F8F8BF16
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        _gemm_nt_f8f8bf16_raw(
+            lhs,
+            rhs,
+            out,
+        )
+
+
+def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+    compile_utils.update_deep_gemm_config(gpu_id, server_args)
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)
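
The entrypoint module above exposes one stable wrapper API (gemm_nt_f8f8bf16, the grouped variants, update_deep_gemm_config, configure_deep_gemm_num_sms) regardless of whether the Hopper-era or Blackwell-era deep_gemm symbols are installed. A caller-side sketch, assuming a GPU build with deep_gemm available and inputs already quantized into the FP8-tensor-plus-scales pairs that deep_gemm expects; the fp8_matmul helper and the shape comments are illustrative assumptions, not part of the package:

# Hypothetical caller of the wrapper; requires deep_gemm and a supported GPU.
from typing import Optional, Tuple

import torch

from sglang.srt.layers.quantization.deep_gemm_wrapper import entrypoint


def fp8_matmul(
    lhs: Tuple[torch.Tensor, torch.Tensor],  # (FP8 activation tensor, its scales)
    rhs: Tuple[torch.Tensor, torch.Tensor],  # (FP8 weight tensor, its scales)
    num_sms: Optional[int] = None,
) -> torch.Tensor:
    m, _ = lhs[0].shape
    n, _ = rhs[0].shape
    out = torch.empty(m, n, device=lhs[0].device, dtype=torch.bfloat16)
    # Optionally cap the SM count for this call; the context manager restores it.
    with entrypoint.configure_deep_gemm_num_sms(num_sms):
        entrypoint.gemm_nt_f8f8bf16(lhs, rhs, out)
    return out
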
@@ -49,10 +49,9 @@ from sglang.srt.layers.quantization.fp8_kernel import (
 )
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
-    apply_w8a8_block_fp8_linear,
     cutlass_fp8_supported,
+    dispatch_w8a8_block_fp8_linear,
     input_to_float8,
-    is_sm100_supported,
     normalize_e4m3fn_to_e4m3fnuz,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -63,6 +62,7 @@ from sglang.srt.layers.quantization.utils import (
63
62
  per_tensor_dequantize,
64
63
  requantize_with_max_scale,
65
64
  )
65
+ from sglang.srt.layers.utils import is_sm100_supported
66
66
  from sglang.srt.utils import (
67
67
  get_bool_env_var,
68
68
  is_cuda,
@@ -77,8 +77,8 @@ _is_cuda = is_cuda()
77
77
 
78
78
  _is_fp8_fnuz = is_fp8_fnuz()
79
79
 
80
- use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
81
- use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
80
+ _use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
81
+ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
82
82
 
83
83
  if _is_hip:
84
84
  from aiter import ActivationType, QuantType
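The ROCm-related switches are renamed and prefixed with an underscore; in particular, SGLANG_AITER_MOE is replaced by SGLANG_USE_AITER, which is additionally ANDed with the HIP check. A small sketch of how a deployment might opt in before importing sglang; the accepted truthy values are whatever get_bool_env_var recognizes.

# Sketch (not part of the diff): exporting the renamed switches before launch.
import os

os.environ["SGLANG_USE_AITER"] = "1"    # replaces the old SGLANG_AITER_MOE flag
os.environ["SGLANG_INT4_WEIGHT"] = "0"  # leave the INT4 MoE weight path off

# ... then import sglang / start the server as usual; on non-HIP builds the
# AITER path stays disabled because the flag is combined with _is_hip.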
@@ -209,6 +209,8 @@ class Fp8LinearMethod(LinearMethodBase):
          # Marlin doesn't support block-wise fp8
          self.use_marlin = False

+         self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
+
      def create_weights(
          self,
          layer: torch.nn.Module,
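The new attribute resolves which block-FP8 GEMM implementation to use once, when the method object is constructed, so the hot apply() path (next hunk) calls a bound callable instead of re-branching on hardware and flags every forward pass. A self-contained sketch of that dispatch-once pattern follows; all names here are hypothetical stand-ins, not sglang APIs.

# Dispatch-once sketch with hypothetical stand-ins (not sglang APIs).
from typing import Callable


def _cutlass_impl(x: float) -> float:
    return x * 2.0  # stand-in for a CUTLASS block-FP8 GEMM


def _triton_impl(x: float) -> float:
    return x * 2.0  # stand-in for a Triton block-FP8 GEMM


def dispatch_block_fp8_linear(use_cutlass: bool) -> Callable[[float], float]:
    # Decide once, from static capabilities/flags, which kernel to call.
    return _cutlass_impl if use_cutlass else _triton_impl


class LinearMethodSketch:
    def __init__(self, use_cutlass: bool = False):
        # Resolved at construction; apply() has no per-call branching.
        self.w8a8_block_fp8_linear = dispatch_block_fp8_linear(use_cutlass)

    def apply(self, x: float) -> float:
        return self.w8a8_block_fp8_linear(x)


print(LinearMethodSketch().apply(3.0))  # -> 6.0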
@@ -417,7 +419,7 @@ class Fp8LinearMethod(LinearMethodBase):
          )

          if self.block_quant:
-             return apply_w8a8_block_fp8_linear(
+             return self.w8a8_block_fp8_linear(
                  input=x,
                  weight=layer.weight,
                  block_size=self.quant_config.weight_block_size,
@@ -485,7 +487,7 @@ class Fp8MoEMethod:
          from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported

          if self.quant_config.is_checkpoint_fp8_serialized:
-             params_dtype = torch.uint32 if use_hip_int4 else torch.float8_e4m3fn
+             params_dtype = torch.uint32 if _use_hip_int4 else torch.float8_e4m3fn
          tp_size = get_tensor_model_parallel_world_size()
          if self.block_quant:
              block_n, block_k = (
@@ -510,7 +512,7 @@ class Fp8MoEMethod:
              )

          # WEIGHTS
-         if _is_hip and use_hip_int4:
+         if _is_hip and _use_hip_int4:
              # INT4 MoE weight - INT32 packed
              w13_weight = torch.nn.Parameter(
                  torch.empty(
@@ -571,7 +573,7 @@ class Fp8MoEMethod:
              layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
              assert self.quant_config.activation_scheme == "dynamic"
              if (
-                 get_bool_env_var("CUTLASS_MOE")
+                 get_bool_env_var("SGLANG_CUTLASS_MOE")
                  and self.cutlass_fp8_supported
                  and is_sm100_supported()
              ):
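The CUTLASS MoE path is now opted into via SGLANG_CUTLASS_MOE (previously CUTLASS_MOE) and is only taken when CUTLASS FP8 support and an SM100-class GPU are both present. A standalone sketch of the shape of that gate; this is not sglang code and the env-var parsing mirrors, but does not reuse, get_bool_env_var.

# Standalone sketch of the triple gate above (not sglang code).
import os


def should_use_cutlass_moe(cutlass_fp8_supported: bool, is_sm100: bool) -> bool:
    opted_in = os.environ.get("SGLANG_CUTLASS_MOE", "0").lower() in ("1", "true")
    return opted_in and cutlass_fp8_supported and is_sm100


print(should_use_cutlass_moe(cutlass_fp8_supported=True, is_sm100=False))  # False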
@@ -639,7 +641,7 @@ class Fp8MoEMethod:
              layer.register_parameter("w13_weight_scale", w13_weight_scale)
              layer.register_parameter("w2_weight_scale", w2_weight_scale)

-             if _is_hip: # and use_aiter_moe: TODO: add check back after triton kernel
+             if _is_hip: # _use_aiter: TODO: add check back after triton kernel
                  # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
                  w13_weight_scale1 = torch.nn.Parameter(
                      torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -666,7 +668,7 @@ class Fp8MoEMethod:
          set_weight_attrs(w13_weight_scale, extra_weight_attrs)
          set_weight_attrs(w2_weight_scale, extra_weight_attrs)

-         if _is_hip and use_hip_int4:
+         if _is_hip and _use_hip_int4:
              extra_weight_attrs.update(
                  {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
              )
@@ -698,7 +700,7 @@ class Fp8MoEMethod:
              layer.w2_input_scale = None

      def process_weights_after_loading(self, layer: Module) -> None:
-         if _is_hip and use_hip_int4:
+         if _is_hip and _use_hip_int4:
              self.process_weights_hip_int4(layer)
              return

@@ -729,7 +731,7 @@ class Fp8MoEMethod:
              )
              layer.w2_input_scale = None

-             if _is_hip and use_aiter_moe:
+             if _use_aiter:
                  # Pre-shuffle weights
                  layer.w13_weight.data = shuffle_weight(
                      layer.w13_weight.contiguous(), (16, 16)
@@ -851,7 +853,7 @@ class Fp8MoEMethod:
              return

      def process_weights_hip_int4(self, layer: Module):
-         # TODO: and use_aiter_moe: add after triton kernel added
+         # TODO: _use_aiter: add after triton kernel added
          # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
          # Weight Permutation
          layer.w13_weight = torch.nn.Parameter(
@@ -898,7 +900,7 @@ class Fp8MoEMethod:
                  padding_size, # Avoid circular import
              )

-             if use_aiter_moe:
+             if _use_aiter:
                  layer.w13_weight = torch.nn.Parameter(
                      shuffle_weight(layer.w13_weight.data, (16, 16)),
                      requires_grad=False,
@@ -909,7 +911,7 @@ class Fp8MoEMethod:
                      requires_grad=False,
                  )
                  torch.cuda.empty_cache()
-             # ROCm (use_aiter_moe): using column-wise scaling
+             # ROCm (_use_aiter): using column-wise scaling
              layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
              layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
          elif get_bool_env_var("SGLANG_MOE_PADDING"):
@@ -935,6 +937,7 @@ class Fp8MoEMethod:
          use_grouped_topk: bool,
          topk_group: Optional[int] = None,
          num_expert_group: Optional[int] = None,
+         num_fused_shared_experts: int = 0,
          custom_routing_function: Optional[Callable] = None,
          correction_bias: Optional[torch.Tensor] = None,
          activation: str = "silu",
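num_fused_shared_experts is a new keyword with a default, which apply() simply forwards into expert selection (next hunk), so existing call sites keep working unchanged. A generic sketch of that plumbing pattern with hypothetical functions; the actual routing semantics are not modeled here.

# Generic plumbing sketch (hypothetical functions, routing semantics omitted).
def select_experts_sketch(top_k: int, num_fused_shared_experts: int = 0) -> int:
    # A real implementation might account for fused shared experts here.
    return top_k + num_fused_shared_experts


def apply_sketch(top_k: int, num_fused_shared_experts: int = 0) -> int:
    # The new keyword is forwarded verbatim; older callers can omit it.
    return select_experts_sketch(
        top_k, num_fused_shared_experts=num_fused_shared_experts
    )


print(apply_sketch(top_k=2))                              # -> 2
print(apply_sketch(top_k=2, num_fused_shared_experts=1))  # -> 3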
@@ -955,6 +958,7 @@ class Fp8MoEMethod:
              renormalize=renormalize,
              topk_group=topk_group,
              num_expert_group=num_expert_group,
+             num_fused_shared_experts=num_fused_shared_experts,
              custom_routing_function=custom_routing_function,
              correction_bias=correction_bias,
              routed_scaling_factor=routed_scaling_factor,
@@ -973,14 +977,14 @@ class Fp8MoEMethod:
              return ret

          if (
-             get_bool_env_var("CUTLASS_MOE")
+             get_bool_env_var("SGLANG_CUTLASS_MOE")
              and self.cutlass_fp8_supported
              and self.block_quant
              and is_sm100_supported()
          ):
-             from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts
+             from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8

-             return cutlass_fused_experts(
+             return cutlass_fused_experts_fp8(
                  x,
                  layer.w13_weight.transpose(1, 2),
                  layer.w2_weight.transpose(1, 2),
@@ -1026,6 +1030,7 @@ class Fp8MoEMethod:
              a2_scale=layer.w2_input_scale,
              block_shape=self.quant_config.weight_block_size,
              no_combine=no_combine,
+             routed_scaling_factor=routed_scaling_factor,
          )

      def maybe_apply_hip_fused_experts(
@@ -1037,8 +1042,8 @@ class Fp8MoEMethod:
          activation: str = "silu",
          no_combine: bool = False,
      ) -> Optional[torch.Tensor]:
-         if use_hip_int4:
-             # TODO: add triton kernel and add check use_aiter_moe
+         if _use_hip_int4:
+             # TODO: add triton kernel and add check _use_aiter
              assert not no_combine, f"{no_combine=} is not supported."
              return ck_moe_2stages(
                  x,
@@ -1054,13 +1059,13 @@ class Fp8MoEMethod:
                  )
              )

-         if use_aiter_moe:
+         if _use_aiter:
              assert not no_combine, f"{no_combine=} is not supported."
              if self.block_quant:
-                 # TODO(use_aiter_moe): FP8 block_quant only supports 'silu' for the time-being.
+                 # TODO(_use_aiter): FP8 block_quant only supports 'silu' for the time-being.
                  assert (
                      activation == "silu"
-                 ), f"use_aiter_moe: FP8 bloack_quant {activation=} will be supported later, unset use_aiter_moe"
+                 ), f"_use_aiter: FP8 bloack_quant {activation=} will be supported later, unset _use_aiter"
                  return asm_moe(
                      x,
                      layer.w13_weight,