sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py
@@ -13,7 +13,6 @@
 # ==============================================================================
 """ModelRunner runs the forward passes of the models."""
 
-import collections
 import datetime
 import gc
 import inspect
@@ -36,8 +35,10 @@ from sglang.srt.distributed import (
     init_distributed_environment,
     initialize_model_parallel,
     set_custom_all_reduce,
+    set_mscclpp_all_reduce,
 )
 from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
+from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
 from sglang.srt.layers.dp_attention import (
     get_attention_tp_group,
     get_attention_tp_size,
@@ -51,8 +52,24 @@ from sglang.srt.layers.quantization.deep_gemm import (
 )
 from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_manager import LoRAManager
-from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.managers.eplb_manager import EPLBManager
+from sglang.srt.managers.expert_distribution import (
+    ExpertDistributionRecorder,
+    get_global_expert_distribution_recorder,
+    set_global_expert_distribution_recorder,
+)
+from sglang.srt.managers.expert_location import (
+    ExpertLocationMetadata,
+    compute_initial_expert_location_metadata,
+    get_global_expert_location_metadata,
+    set_global_expert_location_metadata,
+)
+from sglang.srt.managers.schedule_batch import (
+    GLOBAL_SERVER_ARGS_KEYS,
+    global_server_args_dict,
+)
 from sglang.srt.mem_cache.memory_pool import (
     DoubleSparseTokenToKVPool,
     MHATokenToKVPool,
@@ -62,13 +79,10 @@ from sglang.srt.mem_cache.memory_pool import (
 )
 from sglang.srt.mem_cache.paged_allocator import PagedTokenToKVPoolAllocator
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
+from sglang.srt.model_executor.expert_location_updater import ExpertLocationUpdater
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_loader import get_model
-from sglang.srt.model_loader.loader import (
-    DefaultModelLoader,
-    device_loading_context,
-    get_model_loader,
-)
+from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.patch_torch import monkey_patch_torch_reductions
@@ -78,6 +92,7 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
     MultiprocessingSerializer,
+    cpu_has_amx_support,
     enable_show_time_cost,
     get_available_gpu_memory,
     get_bool_env_var,
@@ -94,6 +109,8 @@ from sglang.srt.utils import (
     set_cuda_arch,
 )
 
+_is_hip = is_hip()
+
 # Use a small KV cache pool size for tests in CI
 SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
 
@@ -103,6 +120,19 @@ UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300
 logger = logging.getLogger(__name__)
 
 
+class RankZeroFilter(logging.Filter):
+    """Filter that only allows INFO level logs from rank 0, but allows all other levels from any rank."""
+
+    def __init__(self, is_rank_zero):
+        super().__init__()
+        self.is_rank_zero = is_rank_zero
+
+    def filter(self, record):
+        if record.levelno == logging.INFO:
+            return self.is_rank_zero
+        return True
+
+
 class ModelRunner:
     """ModelRunner runs the forward passes of the models."""
 
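Note: the new RankZeroFilter above suppresses INFO-level records on non-zero tensor-parallel ranks while letting WARNING and ERROR through everywhere, which is what allows the later hunks to drop the `if self.should_log:` guards. A minimal standalone sketch of the same pattern (the demo harness and logger names are illustrative, not sglang code):

import logging


class RankZeroFilter(logging.Filter):
    """Pass INFO records only on rank 0; let all other levels through on any rank."""

    def __init__(self, is_rank_zero):
        super().__init__()
        self.is_rank_zero = is_rank_zero

    def filter(self, record):
        if record.levelno == logging.INFO:
            return self.is_rank_zero
        return True


def demo(rank: int) -> None:
    logging.basicConfig(level=logging.INFO, format="%(name)s %(levelname)s: %(message)s")
    logger = logging.getLogger(f"demo.rank{rank}")
    # Attach the filter only once, mirroring the isinstance() guard added in the next hunk.
    if not any(isinstance(f, RankZeroFilter) for f in logger.filters):
        logger.addFilter(RankZeroFilter(rank == 0))
    logger.info("printed only on rank 0")
    logger.warning("printed on every rank")


if __name__ == "__main__":
    demo(0)
    demo(1)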
@@ -126,6 +156,10 @@ class ModelRunner:
         self.mem_fraction_static = mem_fraction_static
         self.device = server_args.device
         self.gpu_id = gpu_id
+
+        # Apply the rank zero filter to logger
+        if not any(isinstance(f, RankZeroFilter) for f in logger.filters):
+            logger.addFilter(RankZeroFilter(tp_rank == 0))
         self.tp_rank = tp_rank
         self.tp_size = tp_size
         self.pp_rank = pp_rank
@@ -135,7 +169,9 @@ class ModelRunner:
         self.is_draft_worker = is_draft_worker
         self.is_generation = model_config.is_generation
         self.is_multimodal = model_config.is_multimodal
-        self.should_log = tp_rank == 0
+        self.is_multimodal_chunked_prefill_supported = (
+            model_config.is_multimodal_chunked_prefill_supported
+        )
         self.spec_algorithm = SpeculativeAlgorithm.from_string(
             server_args.speculative_algorithm
         )
@@ -145,6 +181,8 @@ class ModelRunner:
         self.use_mla_backend = self.model_config.attention_arch == AttentionArch.MLA
         self.attention_chunk_size = model_config.attention_chunk_size
 
+        self.forward_pass_id = 0
+
         # Model-specific adjustment
         self.model_specific_adjustment()
 
@@ -153,28 +191,10 @@ class ModelRunner:
 
         # Global vars
         global_server_args_dict.update(
-            {
-                "attention_backend": server_args.attention_backend,
-                "debug_tensor_dump_inject": server_args.debug_tensor_dump_inject,
-                "debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder,
-                "deepep_mode": server_args.deepep_mode,
-                "device": server_args.device,
-                "disable_chunked_prefix_cache": server_args.disable_chunked_prefix_cache,
-                "disable_radix_cache": server_args.disable_radix_cache,
-                "enable_nan_detection": server_args.enable_nan_detection,
-                "enable_dp_attention": server_args.enable_dp_attention,
-                "enable_ep_moe": server_args.enable_ep_moe,
-                "enable_deepep_moe": server_args.enable_deepep_moe,
-                "flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
-                "moe_dense_tp_size": server_args.moe_dense_tp_size,
-                "n_share_experts_fusion": server_args.n_share_experts_fusion,
-                "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
-                "torchao_config": server_args.torchao_config,
-                "sampling_backend": server_args.sampling_backend,
-                "speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
-                "speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
+            {k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS}
+            | {
+                # TODO it is indeed not a "server args"
                 "use_mla_backend": self.use_mla_backend,
-                "mm_attention_backend": server_args.mm_attention_backend,
             }
         )
 
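Note: the hunk above replaces the hand-maintained mapping of server args with a comprehension over GLOBAL_SERVER_ARGS_KEYS merged, via the PEP 584 `|` operator, with a few entries that are not server args. A small sketch of the pattern with a hypothetical three-key list (the real GLOBAL_SERVER_ARGS_KEYS in sglang.srt.managers.schedule_batch is much longer):

from types import SimpleNamespace

# Hypothetical stand-ins, only to illustrate the pattern.
GLOBAL_SERVER_ARGS_KEYS = ["attention_backend", "device", "sampling_backend"]

server_args = SimpleNamespace(
    attention_backend="fa3", device="cuda", sampling_backend="flashinfer"
)

global_server_args_dict = {}
global_server_args_dict.update(
    {k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS}
    | {
        # `|` (PEP 584, Python 3.9+) merges dicts; right-hand entries win on duplicate keys.
        "use_mla_backend": True,
    }
)

print(global_server_args_dict)
# {'attention_backend': 'fa3', 'device': 'cuda',
#  'sampling_backend': 'flashinfer', 'use_mla_backend': True}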
@@ -202,6 +222,32 @@ class ModelRunner:
             enable=self.server_args.enable_memory_saver
         )
 
+        if not self.is_draft_worker:
+            set_global_expert_location_metadata(
+                compute_initial_expert_location_metadata(server_args, self.model_config)
+            )
+            if self.tp_rank == 0 and get_bool_env_var(
+                "SGLANG_LOG_EXPERT_LOCATION_METADATA"
+            ):
+                logger.info(
+                    f"Initial expert_location_metadata: {get_global_expert_location_metadata().debug_str()}"
+                )
+
+        set_global_expert_distribution_recorder(
+            ExpertDistributionRecorder.init_new(
+                server_args,
+                get_global_expert_location_metadata(),
+                rank=self.tp_rank,
+            )
+        )
+
+        self.eplb_manager = (
+            EPLBManager(self)
+            if self.server_args.enable_eplb and (not self.is_draft_worker)
+            else None
+        )
+        self.expert_location_updater = ExpertLocationUpdater()
+
         # Load the model
         self.sampler = Sampler()
         self.load_model()
@@ -250,6 +296,16 @@ class ModelRunner:
     def model_specific_adjustment(self):
         server_args = self.server_args
 
+        if (
+            server_args.attention_backend == "intel_amx"
+            and server_args.device == "cpu"
+            and not cpu_has_amx_support()
+        ):
+            logger.info(
+                "The current platform does not support Intel AMX, will fallback to torch_native backend."
+            )
+            server_args.attention_backend = "torch_native"
+
         if server_args.attention_backend is None:
             """
             Auto select the fastest attention backend.
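Note: the new guard quietly downgrades the intel_amx backend to torch_native on CPUs without AMX. cpu_has_amx_support() is imported from sglang.srt.utils and its implementation is not part of this diff; purely as an illustration, a Linux-only probe for the same capability could look like the following (hypothetical helper, not the sglang function):

def cpu_probably_has_amx() -> bool:
    """Hypothetical Linux-only check: look for the amx_tile flag in /proc/cpuinfo.

    This only illustrates what such a capability probe can look like; it is not
    the implementation of sglang.srt.utils.cpu_has_amx_support().
    """
    try:
        with open("/proc/cpuinfo") as f:
            return "amx_tile" in f.read()
    except OSError:
        return False

# Mirroring the fallback in the hunk above (server_args is assumed to be a ServerArgs-like object):
# if server_args.attention_backend == "intel_amx" and server_args.device == "cpu" \
#         and not cpu_probably_has_amx():
#     server_args.attention_backend = "torch_native"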
@@ -259,7 +315,8 @@ class ModelRunner:
                 1.2 In other cases, we will use flashinfer if available, otherwise use triton.
             2. Models with MLA Architecture and using FA3
                 2.1 We will use FA3 backend on hopper.
-                2.2 Otherwise, we will use triton backend.
+                2.2 We will use Flashinfer backend on blackwell.
+                2.3 Otherwise, we will use triton backend.
             """
 
             if not self.use_mla_backend:
@@ -270,6 +327,8 @@ class ModelRunner:
                     and is_fa3_default_architecture(self.model_config.hf_config)
                 ):
                     server_args.attention_backend = "fa3"
+                elif _is_hip:
+                    server_args.attention_backend = "aiter"
                 else:
                     server_args.attention_backend = (
                         "flashinfer" if is_flashinfer_available() else "triton"
@@ -278,31 +337,44 @@ class ModelRunner:
                 # MLA architecture
                 if is_hopper_with_cuda_12_3():
                     server_args.attention_backend = "fa3"
+                elif is_sm100_supported():
+                    server_args.attention_backend = "flashinfer"
+                elif _is_hip:
+                    head_num = self.model_config.get_num_kv_heads(self.tp_size)
+                    # TODO current aiter only support head number 16 or 128 head number
+                    if (
+                        head_num == 128 or head_num == 16
+                    ) and self.spec_algorithm.is_none():
+                        server_args.attention_backend = "aiter"
+                    else:
+                        server_args.attention_backend = "triton"
                 else:
                     server_args.attention_backend = "triton"
-            if self.should_log:
-                logger.info(
-                    f"Attention backend not set. Use {server_args.attention_backend} backend by default."
-                )
+            logger.info(
+                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+            )
         elif self.use_mla_backend:
             if server_args.device != "cpu":
                 if server_args.attention_backend in [
+                    "aiter",
                     "flashinfer",
                     "fa3",
                     "triton",
                     "flashmla",
                     "cutlass_mla",
                 ]:
-                    if self.should_log:
-                        logger.info(
-                            f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
-                        )
+                    logger.info(
+                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
+                    )
                 else:
                     raise ValueError(
                         f"Invalid attention backend for MLA: {server_args.attention_backend}"
                     )
             else:
-                raise ValueError("MLA optimization not supported on CPU.")
+                if server_args.attention_backend != "intel_amx":
+                    raise ValueError(
+                        "MLA optimization not supported on CPU except for intel_amx backend."
+                    )
 
         if (
             server_args.attention_backend == "fa3"
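Note: the MLA branch of the default-backend selection is easier to read as a flat decision table. The sketch below restates the new 0.4.7 logic from the hunk above; the function name and parameters are illustrative stand-ins for is_hopper_with_cuda_12_3(), is_sm100_supported(), _is_hip, get_num_kv_heads(), and spec_algorithm.is_none(), not the real call sites:

def pick_default_mla_backend(
    is_hopper_cuda_12_3: bool,
    is_sm100: bool,
    is_hip: bool,
    kv_head_num: int,
    spec_decoding_enabled: bool,
) -> str:
    """Illustrative restatement of the MLA default attention-backend selection."""
    if is_hopper_cuda_12_3:
        return "fa3"
    if is_sm100:
        return "flashinfer"
    if is_hip:
        # aiter currently supports only 16 or 128 KV heads, and no speculative decoding
        if kv_head_num in (16, 128) and not spec_decoding_enabled:
            return "aiter"
        return "triton"
    return "triton"


print(pick_default_mla_backend(False, True, False, 128, False))  # "flashinfer"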
@@ -315,10 +387,9 @@ class ModelRunner:
             server_args.attention_backend = "triton"
 
         if server_args.enable_double_sparsity:
-            if self.should_log:
-                logger.info(
-                    "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
-                )
+            logger.info(
+                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+            )
             server_args.attention_backend = "triton"
             server_args.disable_cuda_graph = True
             if server_args.ds_heavy_channel_type is None:
@@ -329,26 +400,29 @@ class ModelRunner:
 
         if self.is_multimodal:
             self.mem_fraction_static *= 0.90
-            if self.should_log:
-                logger.info(
-                    f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
-                    f"because this is a multimodal model."
-                )
+            logger.info(
+                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+                f"because this is a multimodal model."
+            )
+            if not self.is_multimodal_chunked_prefill_supported:
+                server_args.chunked_prefill_size = -1
                 logger.info(
-                    "Automatically turn off --chunked-prefill-size for multimodal model."
+                    f"Automatically turn of --chunked-prefill-size as it is not supported for "
+                    f"{self.model_config.hf_config.model_type}"
                 )
-            server_args.chunked_prefill_size = -1
 
         if not self.use_mla_backend:
             server_args.disable_chunked_prefix_cache = True
         elif self.page_size > 1:
-            if self.should_log:
-                logger.info("Disable chunked prefix cache when page size > 1.")
+            logger.info("Disable chunked prefix cache when page size > 1.")
             server_args.disable_chunked_prefix_cache = True
 
         if not server_args.disable_chunked_prefix_cache:
-            if self.should_log:
-                logger.info("Chunked prefix cache is turned on.")
+            logger.info("Chunked prefix cache is turned on.")
+
+        if server_args.attention_backend == "aiter":
+            if self.model_config.context_len > 8192:
+                self.mem_fraction_static *= 0.85
 
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
@@ -381,6 +455,7 @@ class ModelRunner:
         else:
            dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
        set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
+        set_mscclpp_all_reduce(self.server_args.enable_mscclpp)
 
         if not self.is_draft_worker:
             # Only initialize the distributed environment on the target model worker.
@@ -445,10 +520,9 @@ class ModelRunner:
         torch.set_num_threads(1)
         if self.device == "cuda":
             if torch.cuda.get_device_capability()[0] < 8:
-                if self.should_log:
-                    logger.info(
-                        "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
-                    )
+                logger.info(
+                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                )
                 self.server_args.dtype = "float16"
                 self.model_config.dtype = torch.float16
                 if torch.cuda.get_device_capability()[1] < 5:
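Note: apart from the logging change, this block keeps the existing behavior of forcing float16 on GPUs older than sm80, where bfloat16 is not supported. A tiny self-contained version of the same check, using the real torch.cuda.get_device_capability API (the helper itself is illustrative, not sglang's):

import torch


def pick_dtype(requested: torch.dtype = torch.bfloat16) -> torch.dtype:
    """Fall back to float16 on GPUs below compute capability 8.0 (no bfloat16)."""
    if torch.cuda.is_available():
        major, _minor = torch.cuda.get_device_capability()
        if major < 8 and requested == torch.bfloat16:
            return torch.float16
    return requested


print(pick_dtype())  # torch.float16 on e.g. a V100 (sm70), torch.bfloat16 on A100/H100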
@@ -484,11 +558,10 @@ class ModelRunner:
                     self.model.load_kv_cache_scales(
                         self.server_args.quantization_param_path
                     )
-                    if self.should_log:
-                        logger.info(
-                            "Loaded KV cache scaling factors from %s",
-                            self.server_args.quantization_param_path,
-                        )
+                    logger.info(
+                        "Loaded KV cache scaling factors from %s",
+                        self.server_args.quantization_param_path,
+                    )
                 else:
                     raise RuntimeError(
                         "Using FP8 KV cache and scaling factors provided but "
@@ -531,6 +604,19 @@ class ModelRunner:
                 f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
             ) from None
 
+    def update_expert_location(
+        self,
+        new_expert_location_metadata: ExpertLocationMetadata,
+        update_layer_ids: List[int],
+    ):
+        self.expert_location_updater.update(
+            self.model.routed_experts_weights_of_layer,
+            new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
+            nnodes=self.server_args.nnodes,
+            rank=self.tp_rank,
+        )
+
     def update_weights_from_disk(
         self, model_path: str, load_format: str
     ) -> tuple[bool, str]:
@@ -552,13 +638,7 @@ class ModelRunner:
 
  def get_weight_iter(config):
  iter = loader._get_weights_iterator(
- DefaultModelLoader.Source(
- config.model_path,
- revision=config.revision,
- fall_back_to_pt=getattr(
- self.model, "fall_back_to_pt_during_load", True
- ),
- )
+ DefaultModelLoader.Source.init_new(config, self.model)
  )
  return iter
 
@@ -631,7 +711,6 @@ class ModelRunner:
  rank=rank,
  group_name=group_name,
  )
- dist.barrier(group=self._model_update_group, device_ids=[rank])
  return True, "Succeeded to initialize custom process group."
  except Exception as e:
  message = f"Failed to initialize custom process group: {e}."
@@ -726,12 +805,15 @@ class ModelRunner:
  distributed=get_world_group().world_size > 1,
  cpu_group=get_world_group().cpu_group,
  )
- if self.use_mla_backend:
- num_layers = (
- self.model_config.num_hidden_layers
- if not self.is_draft_worker
- else self.model_config.hf_config.num_nextn_predict_layers
+ if self.is_draft_worker:
+ num_layers = getattr(
+ self.model_config.hf_config,
+ "num_nextn_predict_layers",
+ self.num_effective_layers,
  )
+ else:
+ num_layers = self.num_effective_layers
+ if self.use_mla_backend:
  # FIXME: pipeline parallelism is not compatible with mla backend
  assert self.pp_size == 1
  cell_size = (
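The KV-cache sizing now distinguishes draft (speculative) workers explicitly and falls back to the effective layer count when `num_nextn_predict_layers` is absent. The `getattr` fallback in isolation, with plain stand-in config objects:

    from types import SimpleNamespace

    def resolve_kv_num_layers(hf_config, is_draft_worker: bool, num_effective_layers: int) -> int:
        # Draft workers only need KV cache for the next-N prediction layers,
        # if the config declares them; otherwise fall back to the effective layer count.
        if is_draft_worker:
            return getattr(hf_config, "num_nextn_predict_layers", num_effective_layers)
        return num_effective_layers

    cfg_with = SimpleNamespace(num_nextn_predict_layers=1)
    cfg_without = SimpleNamespace()
    print(resolve_kv_num_layers(cfg_with, True, 61))     # 1
    print(resolve_kv_num_layers(cfg_without, True, 61))  # 61
    print(resolve_kv_num_layers(cfg_with, False, 61))    # 61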
@@ -743,7 +825,7 @@ class ModelRunner:
  cell_size = (
  self.model_config.get_num_kv_heads(get_attention_tp_size())
  * self.model_config.head_dim
- * self.num_effective_layers
+ * num_layers
  * 2
  * torch._utils._element_size(self.kv_cache_dtype)
  )
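For the non-MLA branch, `cell_size` is the per-token KV-cache footprint: KV heads × head dim × layers × 2 (K and V planes) × element size. A quick worked example with illustrative numbers, not taken from any particular model:

    def kv_cell_size_bytes(num_kv_heads: int, head_dim: int, num_layers: int,
                           element_size: int) -> int:
        # Per-token KV cache footprint: K and V planes for every layer.
        return num_kv_heads * head_dim * num_layers * 2 * element_size

    # e.g. 8 KV heads, head_dim 128, 32 layers, fp16 (2 bytes) -> 131072 bytes per token
    print(kv_cell_size_bytes(8, 128, 32, 2))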
@@ -762,7 +844,7 @@ class ModelRunner:
  if self.server_args.kv_cache_dtype == "auto":
  self.kv_cache_dtype = self.dtype
  elif self.server_args.kv_cache_dtype == "fp8_e5m2":
- if is_hip(): # Using natively supported format
+ if _is_hip: # Using natively supported format
  self.kv_cache_dtype = torch.float8_e5m2fnuz
  else:
  self.kv_cache_dtype = torch.float8_e5m2
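The fp8 branch now reads a module-level `_is_hip` flag and picks the `fnuz` variant on ROCm, which is the natively supported encoding there. A minimal sketch of the same selection, assuming a torch build recent enough to expose the float8 dtypes:

    import torch

    def pick_fp8_e5m2_kv_dtype(on_hip: bool) -> torch.dtype:
        # ROCm natively supports the fnuz encoding; CUDA uses the standard e5m2 type.
        return torch.float8_e5m2fnuz if on_hip else torch.float8_e5m2

    print(pick_fp8_e5m2_kv_dtype(on_hip=False))  # torch.float8_e5m2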
@@ -834,12 +916,26 @@ class ModelRunner:
  )
 
  if self.req_to_token_pool is None:
- self.req_to_token_pool = ReqToTokenPool(
- size=max_num_reqs + 1,
- max_context_len=self.model_config.context_len + 4,
- device=self.device,
- enable_memory_saver=self.server_args.enable_memory_saver,
- )
+ if self.server_args.disaggregation_mode == "decode":
+ from sglang.srt.disaggregation.decode import DecodeReqToTokenPool
+
+ # subscribe memory for pre-allocated requests
+ # if max_num_reqs <= 32, we pre-allocate 2x requests
+ pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0
+ self.req_to_token_pool = DecodeReqToTokenPool(
+ size=max_num_reqs,
+ max_context_len=self.model_config.context_len + 4,
+ device=self.device,
+ enable_memory_saver=self.server_args.enable_memory_saver,
+ pre_alloc_size=pre_alloc_size,
+ )
+ else:
+ self.req_to_token_pool = ReqToTokenPool(
+ size=max_num_reqs,
+ max_context_len=self.model_config.context_len + 4,
+ device=self.device,
+ enable_memory_saver=self.server_args.enable_memory_saver,
+ )
  else:
  # Draft worker shares req_to_token_pool with the target worker.
  assert self.is_draft_worker
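In decode-side disaggregation the new pool pre-allocates request slots only for small pools: twice `max_num_reqs` when it is at most 32, nothing otherwise. The sizing rule in isolation:

    def decode_pre_alloc_size(max_num_reqs: int) -> int:
        # Small pools double their capacity up front; large pools rely on on-demand allocation.
        return max_num_reqs * 2 if max_num_reqs <= 32 else 0

    print(decode_pre_alloc_size(16))   # 32
    print(decode_pre_alloc_size(256))  # 0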
@@ -924,6 +1020,13 @@ class ModelRunner:
 
  def init_attention_backend(self):
  """Init attention kernel backend."""
+ if self.server_args.enable_two_batch_overlap:
+ self.attn_backend = TboAttnBackend.init_new(self._get_attention_backend)
+ else:
+ self.attn_backend = self._get_attention_backend()
+
+ # TODO unify with 6338
+ def _get_attention_backend(self):
  if self.server_args.attention_backend == "flashinfer":
  if not self.use_mla_backend:
  from sglang.srt.layers.attention.flashinfer_backend import (
@@ -933,18 +1036,18 @@ class ModelRunner:
  # Init streams
  if self.server_args.speculative_algorithm == "EAGLE":
  self.plan_stream_for_flashinfer = torch.cuda.Stream()
- self.attn_backend = FlashInferAttnBackend(self)
+ return FlashInferAttnBackend(self)
  else:
  from sglang.srt.layers.attention.flashinfer_mla_backend import (
  FlashInferMLAAttnBackend,
  )
 
- self.attn_backend = FlashInferMLAAttnBackend(self)
+ return FlashInferMLAAttnBackend(self)
+ elif self.server_args.attention_backend == "aiter":
+ from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
+
+ return AiterAttnBackend(self)
  elif self.server_args.attention_backend == "triton":
- assert self.sliding_window_size is None, (
- "Window attention is not supported in the triton attention backend. "
- "Please use `--attention-backend flashinfer`."
- )
  assert not self.model_config.is_encoder_decoder, (
  "Cross attention is not supported in the triton attention backend. "
  "Please use `--attention-backend flashinfer`."
@@ -954,21 +1057,21 @@ class ModelRunner:
  DoubleSparseAttnBackend,
  )
 
- self.attn_backend = DoubleSparseAttnBackend(self)
+ return DoubleSparseAttnBackend(self)
  else:
  from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
 
- self.attn_backend = TritonAttnBackend(self)
+ return TritonAttnBackend(self)
  elif self.server_args.attention_backend == "torch_native":
  from sglang.srt.layers.attention.torch_native_backend import (
  TorchNativeAttnBackend,
  )
 
- self.attn_backend = TorchNativeAttnBackend(self)
+ return TorchNativeAttnBackend(self)
  elif self.server_args.attention_backend == "flashmla":
  from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
 
- self.attn_backend = FlashMLABackend(self)
+ return FlashMLABackend(self)
  elif self.server_args.attention_backend == "fa3":
  assert (
  torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
@@ -980,13 +1083,20 @@ class ModelRunner:
  FlashAttentionBackend,
  )
 
- self.attn_backend = FlashAttentionBackend(self)
+ return FlashAttentionBackend(self)
  elif self.server_args.attention_backend == "cutlass_mla":
  from sglang.srt.layers.attention.cutlass_mla_backend import (
  CutlassMLABackend,
  )
 
- self.attn_backend = CutlassMLABackend(self)
+ return CutlassMLABackend(self)
+ elif self.server_args.attention_backend == "intel_amx":
+ from sglang.srt.layers.attention.intel_amx_backend import (
+ IntelAMXAttnBackend,
+ )
+
+ logger.info(f"Intel AMX attention backend is enabled.")
+ return IntelAMXAttnBackend(self)
  else:
  raise ValueError(
  f"Invalid attention backend: {self.server_args.attention_backend}"
@@ -1020,7 +1130,7 @@ class ModelRunner:
  if self.server_args.disable_cuda_graph:
  return
 
- tic = time.time()
+ tic = time.perf_counter()
  before_mem = get_available_gpu_memory(self.device, self.gpu_id)
  logger.info(
  f"Capture cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
@@ -1028,13 +1138,12 @@ class ModelRunner:
  self.cuda_graph_runner = CudaGraphRunner(self)
  after_mem = get_available_gpu_memory(self.device, self.gpu_id)
  logger.info(
- f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s. "
+ f"Capture cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. "
  f"mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
  )
 
  def apply_torch_tp(self):
- if self.should_log:
- logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+ logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
  from sglang.srt.model_parallel import tensor_parallel
 
  device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
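The capture timer now uses `time.perf_counter()` instead of `time.time()`; `perf_counter` is monotonic and high-resolution, so elapsed-time logs cannot be skewed by system clock adjustments. The basic pattern, shown with a throwaway helper:

    import time

    def timed(fn, *args, **kwargs):
        # perf_counter is monotonic, so the delta cannot go negative
        # even if the wall clock is adjusted mid-run.
        tic = time.perf_counter()
        result = fn(*args, **kwargs)
        return result, time.perf_counter() - tic

    _, elapsed = timed(sum, range(1_000_000))
    print(f"elapsed: {elapsed:.4f} s")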
@@ -1093,6 +1202,27 @@ class ModelRunner:
  forward_batch: ForwardBatch,
  skip_attn_backend_init: bool = False,
  pp_proxy_tensors: Optional[PPProxyTensors] = None,
+ ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
+ self.forward_pass_id += 1
+
+ with get_global_expert_distribution_recorder().with_forward_pass(
+ self.forward_pass_id,
+ forward_batch,
+ ):
+ output = self._forward_raw(
+ forward_batch, skip_attn_backend_init, pp_proxy_tensors
+ )
+
+ if self.eplb_manager is not None:
+ self.eplb_manager.on_forward_pass_end()
+
+ return output
+
+ def _forward_raw(
+ self,
+ forward_batch: ForwardBatch,
+ skip_attn_backend_init: bool,
+ pp_proxy_tensors: Optional[PPProxyTensors],
  ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
  can_run_cuda_graph = bool(
  forward_batch.forward_mode.is_cuda_graph()
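The public `forward` is now a wrapper: it increments a pass counter, runs the real forward under the expert-distribution recorder's context manager, and notifies the EPLB manager afterwards, while the previous body moves into `_forward_raw`. A hedged, self-contained sketch of that wrap-and-delegate shape; the recorder and the pass-end callback below are simple stand-ins for sglang's real objects:

    import contextlib

    class Recorder:
        @contextlib.contextmanager
        def with_forward_pass(self, pass_id, batch):
            print(f"record start pass={pass_id}")
            try:
                yield
            finally:
                print(f"record end pass={pass_id}")

    class Runner:
        def __init__(self, on_pass_end=None):
            self.forward_pass_id = 0
            self.recorder = Recorder()
            self.on_pass_end = on_pass_end  # stands in for eplb_manager.on_forward_pass_end

        def forward(self, batch):
            # Bookkeeping wrapper: count the pass, record around it, then notify.
            self.forward_pass_id += 1
            with self.recorder.with_forward_pass(self.forward_pass_id, batch):
                output = self._forward_raw(batch)
            if self.on_pass_end is not None:
                self.on_pass_end()
            return output

        def _forward_raw(self, batch):
            return f"output for {batch}"

    print(Runner(on_pass_end=lambda: print("rebalance check")).forward("batch-0"))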
@@ -1171,7 +1301,7 @@ class ModelRunner:
  def model_is_mrope(self) -> bool:
  """Detect if the model has "mrope" rope_scaling type.
  mrope requires keep "rope_deltas" between prompt and decoding phases."""
- rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {})
+ rope_scaling = getattr(self.model_config.hf_text_config, "rope_scaling", {})
  if rope_scaling is None:
  return False
  is_mrope_enabled = "mrope_section" in rope_scaling
@@ -197,6 +197,15 @@ class DefaultModelLoader(BaseModelLoader):
  fall_back_to_pt: bool = True
  """Whether .pt weights can be used."""
 
+ @classmethod
+ def init_new(cls, model_config: ModelConfig, model):
+ return cls(
+ model_config.model_path,
+ model_config.revision,
+ prefix="",
+ fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
+ )
+
  def __init__(self, load_config: LoadConfig):
  super().__init__(load_config)
  if load_config.model_loader_extra_config:
@@ -341,12 +350,7 @@ class DefaultModelLoader(BaseModelLoader):
  model: nn.Module,
  ) -> Generator[Tuple[str, torch.Tensor], None, None]:
 
- primary_weights = DefaultModelLoader.Source(
- model_config.model_path,
- model_config.revision,
- prefix="",
- fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
- )
+ primary_weights = DefaultModelLoader.Source.init_new(model_config, model)
  yield from self._get_weights_iterator(primary_weights)
 
  secondary_weights = cast(
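The new `Source.init_new` classmethod, defined in the first `DefaultModelLoader` hunk and used in the second, collapses the repeated inline construction (model path, revision, empty prefix, and the `fall_back_to_pt_during_load` opt-out read off the model) into one factory. A simplified sketch of the same idea with stand-in config and model types:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Source:
        model_path: str
        revision: Optional[str] = None
        prefix: str = ""
        fall_back_to_pt: bool = True

        @classmethod
        def init_new(cls, model_config, model):
            # One place to encode the defaults instead of repeating them at every call site.
            return cls(
                model_config.model_path,
                model_config.revision,
                prefix="",
                fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
            )

    class DummyConfig:
        model_path = "org/model"
        revision = None

    class DummyModel:
        fall_back_to_pt_during_load = False

    print(Source.init_new(DummyConfig(), DummyModel()))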