sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -60,6 +60,7 @@ class ServerArgs:
     is_embedding: bool = False
     enable_multimodal: Optional[bool] = None
     revision: Optional[str] = None
+    impl: str = "auto"
 
     # Port for the HTTP server
     host: str = "127.0.0.1"
@@ -89,6 +90,7 @@
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
     gpu_id_step: int = 1
+    sleep_on_idle: bool = False
 
     # Logging
     log_level: str = "info"
@@ -110,14 +112,12 @@
     file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
+    tool_call_parser: Optional[str] = None
 
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
 
-    # Expert parallelism
-    ep_size: int = 1
-
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
     nnodes: int = 1
@@ -136,6 +136,7 @@
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
+    mm_attention_backend: Optional[str] = None
 
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
@@ -147,6 +148,26 @@
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
 
+    # Expert parallelism
+    ep_size: int = 1
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_algorithm: str = "auto"
+    eplb_rebalance_num_iterations: int = 1000
+    eplb_rebalance_layers_per_chunk: Optional[int] = None
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "stat_approx", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -157,34 +178,24 @@
 
     # Optimization/debug options
     disable_radix_cache: bool = False
+    cuda_graph_max_bs: Optional[int] = None
+    cuda_graph_bs: Optional[List[int]] = None
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_profile_cuda_graph: bool = False
     enable_nccl_nvls: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
+    enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
+    disable_overlap_cg_plan: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
-    ep_num_redundant_experts: int = 0
-    ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
-    init_expert_location: str = "trivial"
-    enable_eplb: bool = False
-    eplb_rebalance_num_iterations: int = 1000
-    expert_distribution_recorder_mode: Optional[
-        Literal["stat", "per_pass", "per_token"]
-    ] = None
-    expert_distribution_recorder_buffer_size: Optional[int] = None
-    enable_expert_distribution_metrics: bool = False
-    deepep_config: Optional[str] = None
+    enable_two_batch_overlap: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: Optional[int] = None
-    cuda_graph_bs: Optional[List[int]] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -195,29 +206,32 @@
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
-    warmups: Optional[str] = None
-    moe_dense_tp_size: Optional[int] = None
-    n_share_experts_fusion: int = 0
+    disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
-    mm_attention_backend: Optional[str] = None
+    enable_return_hidden_states: bool = False
+    warmups: Optional[str] = None
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
+    debug_tensor_dump_prefill_only: bool = False
 
     # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: str = "null"
-    disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
+    disaggregation_bootstrap_port: int = 8998
+    disaggregation_decode_tp: Optional[int] = None
+    disaggregation_decode_dp: Optional[int] = None
+    disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
     pdlb_url: Optional[str] = None
 
     def __post_init__(self):
@@ -243,40 +257,72 @@ class ServerArgs:
 
         gpu_mem = get_device_memory_capacity(self.device)
 
-        # Set mem fraction static, which depends on the tensor parallelism size
+        # Set mem fraction static
         if self.mem_fraction_static is None:
-            parallel_size = self.tp_size * self.pp_size
-            if gpu_mem is not None and gpu_mem <= 81920:
-                if parallel_size >= 16:
-                    self.mem_fraction_static = 0.79
-                elif parallel_size >= 8:
-                    self.mem_fraction_static = 0.81
-                elif parallel_size >= 4:
-                    self.mem_fraction_static = 0.85
-                elif parallel_size >= 2:
-                    self.mem_fraction_static = 0.87
+            if gpu_mem is not None:
+                # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+                # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
+
+                # We want mem_fraction_static to be as large as possible but still has enough room
+                # for activations and cuda graph buffers. We use the following heuristic to
+                # compute the needed size for activations and cuda graph buffers:
+                # - The size of the activation depends on the chunked_prefill_size and model size.
+                # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
+                # For GPUs with more memory, we use a larger chunked_prefill_size and
+                # capture more cuda graphs, so they need to reserve more memory.
+                parallel_size = self.tp_size * self.pp_size
+
+                if gpu_mem < 20 * 1024:
+                    # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 35 * 1024:
+                    # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 90 * 1024:
+                    # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
+                    reserved_mem = (9.5 + parallel_size / 2) * 1024
+                elif gpu_mem < 100 * 1024:
+                    # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
+                elif gpu_mem < 160 * 1024:
+                    # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
                 else:
-                    self.mem_fraction_static = 0.88
+                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                    reserved_mem = 32 * 1024
+
+                if self.speculative_algorithm is not None:
+                    # draft model and larger cuda graph buffers
+                    reserved_mem += 2 * 1024
+                if self.enable_dp_attention:
+                    reserved_mem += 4 * 1024
+
+                self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
             else:
                 self.mem_fraction_static = 0.88
-            if gpu_mem is not None and gpu_mem > 96 * 1024:
-                mem_fraction = self.mem_fraction_static
-                self.mem_fraction_static = min(
-                    mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
-                    (gpu_mem - 1024 * 18)
-                    / gpu_mem,  # 15 GB + additional 3GB for cuda graph
-                )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
-            if gpu_mem is not None and gpu_mem < 25_000:
-                self.chunked_prefill_size = 2048
-            elif self.disaggregation_mode != "null":
-                self.chunked_prefill_size = 16384
+            if gpu_mem is not None:
+                if gpu_mem < 35 * 1024:  # A10, L40, 4090
+                    self.chunked_prefill_size = 2048
+                elif gpu_mem < 160 * 1024:  # H100, H200, A100, H20
+                    self.chunked_prefill_size = 8192
+                else:  # B200, MI300
+                    self.chunked_prefill_size = 16384
             else:
-                self.chunked_prefill_size = 8192
+                self.chunked_prefill_size = 4096
         assert self.chunked_prefill_size % self.page_size == 0
 
+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            if gpu_mem is not None and gpu_mem < 35 * 1024:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 8
+                else:
+                    self.cuda_graph_max_bs = 80
+
         assert self.moe_dense_tp_size in {
             1,
             None,
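The comment block in this hunk explains the new default in words: reserve room for activations and cuda graph buffers, then give the rest to model weights plus the KV cache pool. As a worked example, the arithmetic can be sketched standalone (an illustration only, not code from the wheel; the helper name is hypothetical and gpu_mem is assumed to be in MiB, consistent with thresholds such as 90 * 1024 for roughly 90 GB):

def estimate_mem_fraction_static(gpu_mem, parallel_size=1, speculative=False, dp_attention=False):
    """Standalone mirror of the reservation heuristic above; gpu_mem is in MiB."""
    if gpu_mem < 35 * 1024:            # T4, 4080, A10, L40, 4090, 5090
        reserved = (2.8 + parallel_size / 10) * 1024
    elif gpu_mem < 90 * 1024:          # H100, A100
        reserved = (9.5 + parallel_size / 2) * 1024
    elif gpu_mem < 160 * 1024:         # H20, H200
        reserved = (12 + parallel_size / 2) * 1024
    else:                              # B200, MI300
        reserved = 32 * 1024
    if speculative:                    # draft model and larger cuda graph buffers
        reserved += 2 * 1024
    if dp_attention:
        reserved += 4 * 1024
    return round((gpu_mem - reserved) / gpu_mem, 3)

# An 80 GiB GPU at TP=1, PP=1 reserves (9.5 + 0.5) * 1024 = 10240 MiB, so the
# default becomes (81920 - 10240) / 81920 = 0.875.
print(estimate_mem_fraction_static(80 * 1024))  # 0.875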
@@ -294,21 +340,17 @@
             )
             self.page_size = 128
 
-        # Set cuda graph max batch size
-        if self.cuda_graph_max_bs is None:
-            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
-            if gpu_mem is not None and gpu_mem < 25_000:
-                if self.tp_size < 4:
-                    self.cuda_graph_max_bs = 8
-                else:
-                    self.cuda_graph_max_bs = 80
-
         # Set kernel backends for hpu device
         if self.device == "hpu":
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"
 
         # Set kernel backends
+        if self.device == "cpu":
+            if self.attention_backend is None:
+                self.attention_backend = "intel_amx"
+            self.sampling_backend = "pytorch"
+
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -365,12 +407,28 @@
                 "Pipeline parallelism is incompatible with overlap schedule."
             )
 
+        if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
+            self.expert_distribution_recorder_mode = "stat"
+            logger.info(
+                "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
+            )
+
+        if (self.enable_eplb or (self.init_expert_location is not None)) and (
+            self.ep_dispatch_algorithm is None
+        ):
+            self.ep_dispatch_algorithm = "static"
+            logger.info(
+                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
+            )
+
+        if self.enable_expert_distribution_metrics and (
+            self.expert_distribution_recorder_mode is None
+        ):
+            self.expert_distribution_recorder_mode = "stat"
+
         if self.expert_distribution_recorder_buffer_size is None:
-            # TODO pr-chain: enable this later
-            # if (x := self.eplb_rebalance_num_iterations) is not None:
-            #     self.expert_distribution_recorder_buffer_size = x
-            if False:
-                pass
+            if (x := self.eplb_rebalance_num_iterations) is not None:
+                self.expert_distribution_recorder_buffer_size = x
         elif self.expert_distribution_recorder_mode is not None:
             self.expert_distribution_recorder_buffer_size = 1000
 
@@ -387,6 +445,12 @@
                 "Overlap scheduler is disabled because of using "
                 "eagle speculative decoding."
             )
+            if self.enable_mixed_chunk:
+                self.enable_mixed_chunk = False
+                logger.warning(
+                    "Mixed chunked prefill is disabled because of using "
+                    "eagle speculative decoding."
+                )
 
         model_arch = get_model_arch(self)
 
@@ -409,7 +473,7 @@
                 self.speculative_num_steps,
                 self.speculative_eagle_topk,
                 self.speculative_num_draft_tokens,
-            ) = auto_choose_speculative_params(model_arch)
+            ) = auto_choose_speculative_params(self)
 
             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
@@ -444,12 +508,27 @@ class ServerArgs:
             self.triton_attention_num_kv_splits = 16
 
         # PD disaggregation
-        if self.disaggregation_mode == "prefill":
-            self.disable_cuda_graph = True
-            logger.warning("Cuda graph is disabled for prefill server")
-        elif self.disaggregation_mode == "decode":
+        if self.disaggregation_mode == "decode":
+            assert (
+                self.disaggregation_decode_tp is None
+            ), "Cannot set --disaggregation-decode-tp for the decode engine."
+            assert (
+                self.disaggregation_decode_dp is None
+            ), "Cannot set --disaggregation-decode-dp for the decode engine."
+
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+        elif self.disaggregation_mode == "prefill":
+            if self.disaggregation_decode_tp is None:
+                self.disaggregation_decode_tp = self.tp_size
+            if self.disaggregation_decode_dp is None:
+                self.disaggregation_decode_dp = self.dp_size
+
+            self.disaggregation_prefill_pp = self.pp_size
+            self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
+
+            self.disable_cuda_graph = True
+            logger.warning("Cuda graph is disabled for prefill server")
 
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
@@ -459,6 +538,14 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )
 
+    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+        larger_tp = max(decode_tp, prefill_tp)
+        smaller_tp = min(decode_tp, prefill_tp)
+        assert larger_tp % smaller_tp == 0, (
+            "Different tp size is supported only when one tp is multiple of the other. "
+            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+        )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
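The new `validate_disagg_tp_size` helper only requires that one TP size divide the other. A minimal restatement of that rule (the helper name below is made up for illustration):

    def tp_sizes_compatible(prefill_tp: int, decode_tp: int) -> bool:
        # Mirrors the assertion in validate_disagg_tp_size above.
        larger, smaller = max(prefill_tp, decode_tp), min(prefill_tp, decode_tp)
        return larger % smaller == 0

    assert tp_sizes_compatible(4, 2)      # prefill TP4 with decode TP2 is accepted
    assert not tp_sizes_compatible(4, 3)  # 4 is not a multiple of 3, so this pairing is rejected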
@@ -475,10 +562,16 @@ class ServerArgs:
             help="The path of the tokenizer.",
         )
         parser.add_argument(
-            "--host", type=str, default=ServerArgs.host, help="The host of the server."
+            "--host",
+            type=str,
+            default=ServerArgs.host,
+            help="The host of the HTTP server.",
         )
         parser.add_argument(
-            "--port", type=int, default=ServerArgs.port, help="The port of the server."
+            "--port",
+            type=int,
+            default=ServerArgs.port,
+            help="The port of the HTTP server.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -633,6 +726,18 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )
 
         # Memory and scheduling
         parser.add_argument(
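The help text for the new `--impl` flag implies a simple resolution order, sketched below with a hypothetical helper (not an sglang API):

    def resolve_impl(impl: str, has_sglang_impl: bool) -> str:
        # "auto" prefers the native SGLang model and falls back to Transformers.
        if impl == "auto":
            return "sglang" if has_sglang_impl else "transformers"
        return impl  # "sglang" or "transformers" is used exactly as requested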
@@ -766,6 +871,11 @@ class ServerArgs:
             default=ServerArgs.gpu_id_step,
             help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
         )
+        parser.add_argument(
+            "--sleep-on-idle",
+            action="store_true",
+            help="Reduce CPU usage when sglang is idle.",
+        )
 
         # Logging
         parser.add_argument(
@@ -873,6 +983,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        parser.add_argument(
+            "--tool-call-parser",
+            type=str,
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
+            default=ServerArgs.tool_call_parser,
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
+        )
 
         # Data parallelism
         parser.add_argument(
@@ -893,15 +1010,6 @@ class ServerArgs:
             ],
         )
 
-        # Expert parallelism
-        parser.add_argument(
-            "--expert-parallel-size",
-            "--ep-size",
-            type=int,
-            default=ServerArgs.ep_size,
-            help="The expert parallelism size.",
-        )
-
         # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
@@ -957,12 +1065,13 @@ class ServerArgs:
             type=str,
             choices=[
                 "aiter",
-                "flashinfer",
-                "triton",
-                "torch_native",
+                "cutlass_mla",
                 "fa3",
+                "flashinfer",
                 "flashmla",
-                "cutlass_mla",
+                "intel_amx",
+                "torch_native",
+                "triton",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -981,21 +1090,6 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
-        parser.add_argument(
-            "--enable-flashinfer-mla",
-            action=DeprecatedAction,
-            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashmla",
-            action=DeprecatedAction,
-            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
-        )
-        parser.add_argument(
-            "--flashinfer-mla-disable-ragged",
-            action="store_true",
-            help="Not using ragged prefill wrapper when running flashinfer mla",
-        )
 
         # Speculative decoding
         parser.add_argument(
@@ -1045,6 +1139,109 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
+
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="Enabling DeepEP MoE implementation for EP MoE.",
+        )
+        parser.add_argument(
+            "--deepep-mode",
+            type=str,
+            choices=["normal", "low_latency", "auto"],
+            default="auto",
+            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+        )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+        )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )
 
         # Double Sparsity
         parser.add_argument(
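The block above gathers the expert-parallel and EPLB flags into a single group and adds several new ones. As an illustration only, a DeepEP-style expert-parallel launch might combine them roughly like this; the model path and numeric values are placeholders, not tuned recommendations:

    # Placeholder argv for the CLI flags defined in the hunk above; adjust to your cluster.
    ep_argv = [
        "--model-path", "your/model",
        "--enable-deepep-moe",
        "--deepep-mode", "auto",
        "--ep-num-redundant-experts", "16",
        "--enable-eplb",
        "--eplb-rebalance-num-iterations", "1000",
        "--expert-distribution-recorder-mode", "stat",
        "--enable-expert-distribution-metrics",
    ]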
@@ -1089,6 +1286,18 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
+        )
+        parser.add_argument(
+            "--cuda-graph-bs",
+            type=int,
+            nargs="+",
+            help="Set the list of batch sizes for cuda graph.",
+        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -1099,6 +1308,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-profile-cuda-graph",
+            action="store_true",
+            help="Enable profiling of cuda graph capture.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
@@ -1119,11 +1333,21 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-mscclpp",
+            action="store_true",
+            help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
+        parser.add_argument(
+            "--disable-overlap-cg-plan",
+            action="store_true",
+            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1140,9 +1364,9 @@ class ServerArgs:
             help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
         )
         parser.add_argument(
-            "--enable-ep-moe",
+            "--enable-two-batch-overlap",
             action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+            help="Enabling two micro batches to overlap.",
         )
         parser.add_argument(
             "--enable-torch-compile",
@@ -1155,18 +1379,6 @@ class ServerArgs:
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
-        parser.add_argument(
-            "--cuda-graph-max-bs",
-            type=int,
-            default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
-        )
-        parser.add_argument(
-            "--cuda-graph-bs",
-            type=int,
-            nargs="+",
-            help="Set the list of batch sizes for cuda graph.",
-        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -1223,13 +1435,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--tool-call-parser",
-            type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
-            default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
-        )
         parser.add_argument(
             "--enable-hierarchical-cache",
             action="store_true",
@@ -1255,82 +1460,14 @@ class ServerArgs:
             help="The write policy of hierarchical cache.",
         )
         parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
-        parser.add_argument(
-            "--moe-dense-tp-size",
-            type=int,
-            default=ServerArgs.moe_dense_tp_size,
-            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
-        )
-        parser.add_argument(
-            "--deepep-mode",
-            type=str,
-            choices=["normal", "low_latency", "auto"],
-            default="auto",
-            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
-        )
-        parser.add_argument(
-            "--ep-num-redundant-experts",
-            type=int,
-            default=ServerArgs.ep_num_redundant_experts,
-            help="Allocate this number of redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--ep-dispatch-algorithm",
-            type=str,
-            default=ServerArgs.ep_dispatch_algorithm,
-            help="The algorithm to choose ranks for redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--init-expert-location",
-            type=str,
-            default=ServerArgs.init_expert_location,
-            help="Initial location of EP experts.",
-        )
-        parser.add_argument(
-            "--enable-eplb",
+            "--flashinfer-mla-disable-ragged",
             action="store_true",
-            help="Enable EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-num-iterations",
-            type=int,
-            default=ServerArgs.eplb_rebalance_num_iterations,
-            help="Number of iterations to automatically trigger a EPLB re-balance.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-mode",
-            type=str,
-            default=ServerArgs.expert_distribution_recorder_mode,
-            help="Mode of expert distribution recorder.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-buffer-size",
-            type=int,
-            default=ServerArgs.expert_distribution_recorder_buffer_size,
-            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
         )
         parser.add_argument(
-            "--enable-expert-distribution-metrics",
+            "--disable-shared-experts-fusion",
             action="store_true",
-            help="Enable logging metrics for expert balancedness",
-        )
-        parser.add_argument(
-            "--deepep-config",
-            type=str,
-            default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster.",
-        )
-
-        parser.add_argument(
-            "--n-share-experts-fusion",
-            type=int,
-            default=0,
-            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
+            help="Disable shared experts fusion optimization for deepseek v3/r1.",
         )
         parser.add_argument(
             "--disable-chunked-prefix-cache",
@@ -1342,8 +1479,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
-
-        # Server warmups
+        parser.add_argument(
+            "--enable-return-hidden-states",
+            action="store_true",
+            help="Enable returning hidden states with responses.",
+        )
         parser.add_argument(
             "--warmups",
             type=str,
@@ -1371,6 +1511,11 @@ class ServerArgs:
             default=ServerArgs.debug_tensor_dump_inject,
             help="Inject the outputs from jax as the input of every layer.",
         )
+        parser.add_argument(
+            "--debug-tensor-dump-prefill-only",
+            action="store_true",
+            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+        )
 
         # Disaggregation
         parser.add_argument(
@@ -1380,6 +1525,13 @@ class ServerArgs:
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
+        parser.add_argument(
+            "--disaggregation-transfer-backend",
+            type=str,
+            default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
+            help="The backend for disaggregation transfer. Default is mooncake.",
+        )
         parser.add_argument(
             "--disaggregation-bootstrap-port",
             type=int,
@@ -1387,11 +1539,22 @@ class ServerArgs:
             help="Bootstrap server port on the prefill server. Default is 8998.",
         )
         parser.add_argument(
-            "--disaggregation-transfer-backend",
-            type=str,
-            default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl"],
-            help="The backend for disaggregation transfer. Default is mooncake.",
+            "--disaggregation-decode-tp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_tp,
+            help="Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-decode-dp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_dp,
+            help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-prefill-pp",
+            type=int,
+            default=ServerArgs.disaggregation_prefill_pp,
+            help="Prefill pp size. If not set, it is default to 1. This is only set on the decode server.",
         )
         parser.add_argument(
             "--disaggregation-ib-device",
@@ -1401,6 +1564,12 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--num-reserved-decode-tokens",
+            type=int,
+            default=ServerArgs.num_reserved_decode_tokens,
+            help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
+        )
         parser.add_argument(
             "--pdlb-url",
             type=str,
@@ -1408,14 +1577,6 @@ class ServerArgs:
             help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
         )
 
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
-
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -1451,7 +1612,7 @@ class ServerArgs:
             self.max_loras_per_batch > 0
             # FIXME
             and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and cuda graph and radix attention is in progress"
+        ), "compatibility of lora and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
@@ -1585,18 +1746,29 @@ def get_model_arch(args: ServerArgs):
     return hf_config.architectures[0]
 
 
-def auto_choose_speculative_params(arch: str):
+def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
+    kwargs = {}
+
+    hf_config = get_config(
+        self.model_path,
+        trust_remote_code=self.trust_remote_code,
+        revision=self.revision,
+        model_override_args=json.loads(self.json_model_override_args),
+        **kwargs,
+    )
+    arch = hf_config.architectures[0]
+
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
     elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
         # The default value for deepseek
-        return (5, 4, 8)
+        return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
     else: