sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -19,12 +19,11 @@ import json
 import logging
 import os
 import random
-import sys
 import tempfile
-from typing import List, Literal, Optional, Union
+from typing import Dict, List, Literal, Optional, Union
 
+from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -36,18 +35,22 @@ from sglang.srt.utils import (
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
+    json_list_type,
     nullable_str,
+    parse_connector_type,
 )
+from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
-
 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
@@ -60,6 +63,7 @@ LOAD_FORMAT_CHOICES = [
     "bitsandbytes",
     "layered",
     "remote",
+    "remote_instance",
 ]
 
 QUANTIZATION_CHOICES = [
@@ -86,9 +90,12 @@ ATTENTION_BACKEND_CHOICES = [
     # Common
     "triton",
     "torch_native",
+    "flex_attention",
+    "nsa",
     # NVIDIA specific
     "cutlass_mla",
     "fa3",
+    "fa4",
     "flashinfer",
     "flashmla",
     "trtllm_mla",
@@ -102,8 +109,29 @@ ATTENTION_BACKEND_CHOICES = [
     "ascend",
 ]
 
+LORA_BACKEND_CHOICES = ["triton", "csgmv"]
+
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
+
+NSA_CHOICES = ["flashmla_prefill", "flashmla_decode", "fa3", "tilelang", "aiter"]
+
+RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
+
+MOE_RUNNER_BACKEND_CHOICES = [
+    "auto",
+    "deep_gemm",
+    "triton",
+    "triton_kernel",
+    "flashinfer_trtllm",
+    "flashinfer_cutlass",
+    "flashinfer_mxfp4",
+    "flashinfer_cutedsl",
+]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -122,6 +150,22 @@ def add_disagg_transfer_backend_choices(choices):
     DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
 
 
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
+def add_moe_runner_backend_choices(choices):
+    MOE_RUNNER_BACKEND_CHOICES.extend(choices)
+
+
+def add_deterministic_attention_backend_choices(choices):
+    DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_radix_eviction_policy_choices(choices):
+    RADIX_EVICTION_POLICY_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
@@ -133,6 +177,7 @@ class ServerArgs:
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
     trust_remote_code: bool = False
+    modelopt_quant: Optional[Union[str, Dict]] = None
    context_length: Optional[int] = None
     is_embedding: bool = False
     enable_multimodal: Optional[bool] = None
@@ -151,26 +196,31 @@ class ServerArgs:
     quantization: Optional[str] = None
     quantization_param_path: Optional[str] = None
     kv_cache_dtype: str = "auto"
+    enable_fp32_lm_head: bool = False
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
-    max_queued_requests: Optional[int] = sys.maxsize
+    max_queued_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
+    enable_priority_scheduling: bool = False
+    schedule_low_priority_values_first: bool = False
+    priority_scheduling_preemption_threshold: int = 10
     schedule_conservativeness: float = 1.0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
+    radix_eviction_policy: str = "lru"
 
     # Runtime options
     device: Optional[str] = None
     tp_size: int = 1
     pp_size: int = 1
-    max_micro_batch_size: Optional[int] = None
+    pp_max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -191,6 +241,8 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
+    tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
+    tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
@@ -201,6 +253,8 @@ class ServerArgs:
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
     gc_warning_threshold_secs: float = 0.0
+    enable_trace: bool = False
+    oltp_traces_endpoint: str = "localhost:4317"
 
     # API related
     api_key: Optional[str] = None
@@ -213,10 +267,14 @@ class ServerArgs:
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
     tool_server: Optional[str] = None
+    sampling_defaults: str = "model"
 
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    load_watch_interval: float = 0.1
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -237,6 +295,7 @@ class ServerArgs:
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
+    max_lora_chunk_size: Optional[int] = 16
 
     # Kernel backend
     attention_backend: Optional[str] = None
@@ -245,28 +304,33 @@ class ServerArgs:
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
+    nsa_prefill: str = "flashmla_prefill"
+    nsa_decode: str = "fa3"
 
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
+    # For ngram only
+    speculative_ngram_min_match_window_size: int = 1
+    speculative_ngram_max_match_window_size: int = 12
+    speculative_ngram_min_bfs_breadth: int = 1
+    speculative_ngram_max_bfs_breadth: int = 10
+    speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
+    speculative_ngram_branch_length: int = 18
+    speculative_ngram_capacity: int = 10 * 1000 * 1000
 
     # Expert parallelism
     ep_size: int = 1
     moe_a2a_backend: Literal["none", "deepep"] = "none"
-    moe_runner_backend: Literal[
-        "auto",
-        "triton",
-        "triton_kernel",
-        "flashinfer_trtllm",
-        "flashinfer_cutlass",
-        "flashinfer_mxfp4",
-    ] = "auto"
+    moe_runner_backend: str = "auto"
     flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
@@ -286,6 +350,10 @@ class ServerArgs:
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Hierarchical cache
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
@@ -296,6 +364,8 @@ class ServerArgs:
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -312,6 +382,12 @@ class ServerArgs:
     offload_prefetch_step: int = 1
     offload_mode: str = "cpu"
 
+    # Scoring configuration
+    # Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
+    # Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
+    # This enables efficient batch processing of multiple items against a single query.
+    multi_item_scoring_delimiter: Optional[Union[int]] = None
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
@@ -327,11 +403,13 @@ class ServerArgs:
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
+    enable_torch_symm_mem: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    enable_single_batch_overlap: bool = False
     tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
@@ -340,17 +418,27 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
+    enable_weights_cpu_backup: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    keep_mm_feature_on_device: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+    enable_deterministic_inference: bool = False
+
+    # Dynamic batch tokenizer
+    enable_dynamic_batch_tokenizer: bool = False
+    dynamic_batch_tokenizer_batch_size: int = 32
+    dynamic_batch_tokenizer_batch_timeout: float = 0.002
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -359,66 +447,124 @@ class ServerArgs:
     debug_tensor_dump_prefill_only: bool = False
 
     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
-    disaggregation_mode: str = "null"
+    disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
     disaggregation_decode_dp: Optional[int] = None
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    disaggregation_decode_enable_offload_kvcache: bool = False
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-    pdlb_url: Optional[str] = None
+    # FIXME: hack to reduce ITL when decode bs is small
+    disaggregation_decode_polling_interval: int = 1
 
-    # For model weight update
+    # For model weight update and weight loading
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
 
     # For PD-Multiplexing
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
-    # Deprecated arguments
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
-    enable_flashinfer_cutlass_moe: bool = False
-    enable_flashinfer_trtllm_moe: bool = False
-    enable_triton_kernel_moe: bool = False
-    enable_flashinfer_mxfp4_moe: bool = False
+    def get_attention_backends(server_args):
+        prefill_attention_backend_str = (
+            server_args.prefill_attention_backend
+            if server_args.prefill_attention_backend
+            else server_args.attention_backend
+        )
+        decode_attention_backend_str = (
+            server_args.decode_attention_backend
+            if server_args.decode_attention_backend
+            else server_args.attention_backend
+        )
+        return prefill_attention_backend_str, decode_attention_backend_str
 
     def __post_init__(self):
-        # Check deprecated arguments
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            print_deprecated_warning(
-                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
-            )
-        if self.enable_deepep_moe:
-            self.moe_a2a_backend = "deepep"
-            print_deprecated_warning(
-                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
-            )
-        if self.enable_triton_kernel_moe:
-            self.moe_runner_backend = "triton_kernel"
-            print_deprecated_warning(
-                "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
-            )
-        if self.enable_flashinfer_cutlass_moe:
-            self.moe_runner_backend = "flashinfer_cutlass"
-            print_deprecated_warning(
-                "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
-            )
-        if self.enable_flashinfer_trtllm_moe:
-            self.moe_runner_backend = "flashinfer_trtllm"
-            print_deprecated_warning(
-                "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
-            )
-        if self.enable_flashinfer_mxfp4_moe:
-            self.moe_runner_backend = "flashinfer_mxfp4"
-            print_deprecated_warning(
-                "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
+        """
+        Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
+        """
+        # Handle deprecated arguments.
+        self._handle_deprecated_args()
+
+        # Set missing default values.
+        self._handle_missing_default_values()
+
+        # Get GPU memory capacity, which is a common dependency for several configuration steps.
+        gpu_mem = get_device_memory_capacity(self.device)
+
+        # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
+        self._handle_gpu_memory_settings(gpu_mem)
+
+        # Handle device-specific backends.
+        self._handle_hpu_backends()
+        self._handle_cpu_backends()
+
+        # Apply model-specific adjustments.
+        self._handle_model_specific_adjustments()
+
+        # Set kernel backends.
+        self._handle_sampling_backend()
+        self._handle_attention_backend_compatibility()
+        self._handle_page_size()
+        self._handle_amd_specifics()
+        self._handle_grammar_backend()
+
+        # Handle data parallelism.
+        self._handle_data_parallelism()
+
+        # Handle MoE configurations.
+        self._handle_moe_kernel_config()
+        self._handle_deepep_moe()
+        self._handle_eplb_and_dispatch()
+        self._handle_expert_distribution_metrics()
+
+        # Handle pipeline parallelism.
+        self._handle_pipeline_parallelism()
+
+        # Handle Hicache settings.
+        self._handle_hicache()
+
+        # Handle speculative decoding logic.
+        self._handle_speculative_decoding()
+
+        # Handle model loading format.
+        self._handle_load_format()
+
+        # Handle PD disaggregation.
+        self._handle_disaggregation()
+
+        # Validate tokenizer settings.
+        self._handle_tokenizer_batching()
+
+        # Propagate environment variables.
+        self._handle_environment_variables()
+
+        # Validate cache settings.
+        self._handle_cache_compatibility()
+
+        # Validate metrics labels.
+        self._handle_metrics_labels()
+
+        # Handle deterministic inference.
+        self._handle_deterministic_inference()
+
+        # Handle any other necessary validations.
+        self._handle_other_validations()
+
+    def _handle_deprecated_args(self):
+        # handle deprecated tool call parsers
+        deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
+        if self.tool_call_parser in deprecated_tool_call_parsers:
+            logger.warning(
+                f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
             )
+            self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
 
-        # Set missing default values
+    def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
         if self.served_model_name is None:
@@ -428,51 +574,140 @@ class ServerArgs:
428
574
  if self.random_seed is None:
429
575
  self.random_seed = random.randint(0, 1 << 30)
430
576
 
431
- gpu_mem = get_device_memory_capacity(self.device)
577
+ def _handle_gpu_memory_settings(self, gpu_mem):
578
+ """
579
+ Configure GPU memory-dependent settings including
580
+ chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
581
+
582
+ Here are our heuristics:
583
+ - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
584
+ This is because GPUs with more memory are generally more powerful, we need to use a larger
585
+ chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
586
+ - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
587
+
588
+ GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
589
+
590
+ The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
591
+ or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
592
+
593
+ In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
594
+ The activation memory is proportional to the chunked_prefill_size.
595
+ The cuda graph memory is proportional to the cuda_graph_max_bs.
596
+ We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
597
+ and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
598
+
599
+ The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
600
+ """
601
+ if gpu_mem is not None:
602
+ if gpu_mem < 20 * 1024:
603
+ # T4, 4080
604
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
605
+ if self.chunked_prefill_size is None:
606
+ self.chunked_prefill_size = 2048
607
+ if self.cuda_graph_max_bs is None:
608
+ self.cuda_graph_max_bs = 8
609
+ elif gpu_mem < 35 * 1024:
610
+ # A10, 4090, 5090
611
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
612
+ if self.chunked_prefill_size is None:
613
+ self.chunked_prefill_size = 2048
614
+ if self.cuda_graph_max_bs is None:
615
+ # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
616
+ # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
617
+ # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
618
+ if self.tp_size < 4:
619
+ self.cuda_graph_max_bs = 16
620
+ else:
621
+ self.cuda_graph_max_bs = 80
622
+ elif gpu_mem < 60 * 1024:
623
+ # A100 (40GB), L40,
624
+ # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
625
+ if self.chunked_prefill_size is None:
626
+ self.chunked_prefill_size = 4096
627
+ if self.cuda_graph_max_bs is None:
628
+ if self.tp_size < 4:
629
+ self.cuda_graph_max_bs = 32
630
+ else:
631
+ self.cuda_graph_max_bs = 160
632
+ elif gpu_mem < 90 * 1024:
633
+ # H100, A100
634
+ # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
635
+ if self.chunked_prefill_size is None:
636
+ self.chunked_prefill_size = 8192
637
+ if self.cuda_graph_max_bs is None:
638
+ if self.tp_size < 4:
639
+ self.cuda_graph_max_bs = 256
640
+ else:
641
+ self.cuda_graph_max_bs = 512
642
+ elif gpu_mem < 160 * 1024:
643
+ # H20, H200
644
+ # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
645
+ if self.chunked_prefill_size is None:
646
+ self.chunked_prefill_size = 8192
647
+ if self.cuda_graph_max_bs is None:
648
+ if self.tp_size < 4:
649
+ self.cuda_graph_max_bs = 256
650
+ else:
651
+ self.cuda_graph_max_bs = 512
652
+ else:
653
+ # B200, MI300
654
+ # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
655
+ if self.chunked_prefill_size is None:
656
+ self.chunked_prefill_size = 16384
657
+ if self.cuda_graph_max_bs is None:
658
+ self.cuda_graph_max_bs = 512
659
+ else:
660
+ # Fallback defaults when gpu_mem is None
661
+ if self.chunked_prefill_size is None:
662
+ self.chunked_prefill_size = 4096
663
+ if self.cuda_graph_max_bs is None:
664
+ self.cuda_graph_max_bs = 160
432
665
 
433
- # Set mem fraction static
434
- if self.mem_fraction_static is None:
435
- if gpu_mem is not None:
436
- # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
437
- # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
438
-
439
- # We want mem_fraction_static to be as large as possible but still has enough room
440
- # for activations and cuda graph buffers. We use the following heuristic to
441
- # compute the needed size for activations and cuda graph buffers:
442
- # - The size of the activation depends on the chunked_prefill_size and model size.
443
- # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
444
- # For GPUs with more memory, we use a larger chunked_prefill_size and
445
- # capture more cuda graphs, so they need to reserve more memory.
446
- parallel_size = self.tp_size * self.pp_size
447
-
448
- if gpu_mem < 20 * 1024:
449
- # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
450
- reserved_mem = (2.8 + parallel_size / 10) * 1024
451
- elif gpu_mem < 35 * 1024:
452
- # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
453
- reserved_mem = (2.8 + parallel_size / 10) * 1024
454
- elif gpu_mem < 90 * 1024:
455
- # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
456
- reserved_mem = (9.5 + parallel_size / 2) * 1024
457
- elif gpu_mem < 100 * 1024:
458
- # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
459
- reserved_mem = (12 + parallel_size / 2) * 1024
460
- elif gpu_mem < 160 * 1024:
461
- # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
462
- reserved_mem = (12 + parallel_size / 2) * 1024
463
- else:
464
- # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
465
- reserved_mem = 32 * 1024
666
+ # Set cuda graph batch sizes
667
+ if self.cuda_graph_bs is None:
668
+ self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
669
+ else:
670
+ self.cuda_graph_max_bs = max(self.cuda_graph_bs)
466
671
 
467
- if self.speculative_algorithm is not None:
468
- # draft model and larger cuda graph buffers
672
+ if self.mem_fraction_static is None:
673
+ # Constant meta data (e.g., from attention backend)
674
+ reserved_mem = 512
675
+ # For activation during large prefill
676
+ if self.chunked_prefill_size > 0:
677
+ reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
678
+ else:
679
+ reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
680
+ # For cuda graphs
681
+ reserved_mem += self.cuda_graph_max_bs * 2
682
+ # Some adjustments for large parallel size
683
+ reserved_mem += self.tp_size * self.pp_size / 8 * 1024
684
+
685
+ if self.enable_dp_attention:
686
+ # DP attention needs more padding for some operations
687
+ reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
688
+
689
+ # DP attention uses much more memory for large cuda graph max bs,
690
+ # likely due to some inefficiencies in torch allocator or our implementation.
691
+ # So we need to reserve more memory.
692
+ if self.cuda_graph_max_bs > 300:
693
+ reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
694
+
695
+ if gpu_mem is not None and gpu_mem > 60 * 1024:
696
+ reserved_mem = max(reserved_mem, 10 * 1024)
697
+
698
+ if self.speculative_algorithm is not None:
699
+ if self.speculative_algorithm == "STANDALONE":
700
+ # standalonedraft model and cuda graphs
701
+ reserved_mem += 6 * 1024
702
+ elif self.speculative_algorithm != "NGRAM":
703
+ # eagle draft models and cuda graphs
469
704
  reserved_mem += 2 * 1024
470
- if self.enable_dp_attention:
471
- reserved_mem += 4 * 1024
472
705
 
473
- self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
474
- else:
475
- self.mem_fraction_static = 0.88
706
+ self.mem_fraction_static = (
707
+ round((gpu_mem - reserved_mem) / gpu_mem, 3)
708
+ if gpu_mem is not None
709
+ else 0.88
710
+ )
476
711
 
477
712
  # Lazy init to avoid circular import
478
713
  # Multimodal models need more memory for the image processor
@@ -482,53 +717,197 @@ class ServerArgs:
482
717
  if model_config.is_multimodal:
483
718
  self.adjust_mem_fraction_for_vlm(model_config)
484
719
 
485
- # Set chunked prefill size, which depends on the gpu memory capacity
486
- if self.chunked_prefill_size is None:
487
- if gpu_mem is not None:
488
- if gpu_mem < 35 * 1024: # A10, L40, 4090
489
- self.chunked_prefill_size = 2048
490
- elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
491
- self.chunked_prefill_size = 8192
492
- else: # B200, MI300
493
- self.chunked_prefill_size = 16384
494
- else:
495
- self.chunked_prefill_size = 4096
720
+ def _generate_cuda_graph_batch_sizes(self):
721
+ """
722
+ Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
723
+ This integrates the logic from cuda_graph_runner.py.
724
+ """
725
+ # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
726
+ if self.disable_cuda_graph_padding:
727
+ capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
728
+ elif self.speculative_algorithm is None:
729
+ # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1))
730
+ capture_bs = (
731
+ [1, 2, 4, 8, 12]
732
+ + list(range(16, 257, 8))
733
+ + list(range(272, 512, 16))
734
+ + list(range(512, self.cuda_graph_max_bs + 1, 32))
735
+ )
736
+ else:
737
+ # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8))
738
+ capture_bs = (
739
+ list(range(1, 9, 1))
740
+ + list(range(10, 33, 2))
741
+ + list(range(40, 64, 4))
742
+ + list(range(72, 257, 8))
743
+ + list(range(272, self.cuda_graph_max_bs + 1, 16))
744
+ )
496
745
 
497
- # Set cuda graph max batch size
498
- if self.cuda_graph_max_bs is None:
499
- # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
500
- if gpu_mem is not None and gpu_mem < 35 * 1024:
501
- if self.tp_size < 4:
502
- self.cuda_graph_max_bs = 8
503
- else:
504
- self.cuda_graph_max_bs = 80
746
+ capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]
747
+
748
+ return capture_bs
505
749
 
506
- # Set kernel backends for hpu device
750
+ def _handle_hpu_backends(self):
507
751
  if self.device == "hpu":
508
752
  self.attention_backend = "torch_native"
509
753
  self.sampling_backend = "pytorch"
510
754
 
511
- # Model-specific adjustments
512
- self.model_specific_adjustments()
513
-
514
- # Set kernel backends
755
+ def _handle_cpu_backends(self):
515
756
  if self.device == "cpu":
516
757
  if self.attention_backend is None:
517
758
  self.attention_backend = "intel_amx"
518
759
  self.sampling_backend = "pytorch"
519
760
 
761
+ def _handle_model_specific_adjustments(self):
762
+ from sglang.srt.configs.model_config import is_deepseek_nsa
763
+
764
+ if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
765
+ return
766
+
767
+ hf_config = self.get_hf_config()
768
+ model_arch = hf_config.architectures[0]
769
+ if model_arch in ["GptOssForCausalLM"]:
770
+ if (
771
+ self.attention_backend is None
772
+ and self.prefill_attention_backend is None
773
+ and self.decode_attention_backend is None
774
+ ):
775
+ if is_cuda() and is_sm100_supported():
776
+ self.attention_backend = "trtllm_mha"
777
+ elif is_cuda() and is_sm90_supported():
778
+ self.attention_backend = "fa3"
779
+ else:
780
+ self.attention_backend = "triton"
781
+
782
+ supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
783
+ prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
784
+ assert (
785
+ prefill_attn_backend in supported_backends
786
+ and decode_attn_backend in supported_backends
787
+ ), (
788
+ f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got the following backends\n"
789
+ f"- Prefill: {prefill_attn_backend}\n"
790
+ f"- Decode: {decode_attn_backend}\n"
791
+ )
792
+
793
+ if is_sm100_supported():
794
+ if not self.enable_dp_attention:
795
+ self.enable_flashinfer_allreduce_fusion = True
796
+ logger.info(
797
+ "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
798
+ )
799
+ quantization_config = getattr(hf_config, "quantization_config", None)
800
+ is_mxfp4_quant_format = (
801
+ quantization_config is not None
802
+ and quantization_config.get("quant_method") == "mxfp4"
803
+ )
804
+
805
+ if is_sm100_supported() and is_mxfp4_quant_format:
806
+ self.moe_runner_backend = "flashinfer_mxfp4"
807
+ logger.warning(
808
+ "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
809
+ )
810
+ else:
811
+ if self.moe_runner_backend == "triton_kernel":
812
+ assert (
813
+ self.ep_size == 1
814
+ ), "Triton kernel MoE is only supported when ep_size == 1"
815
+ if (
816
+ self.moe_runner_backend == "auto"
817
+ and self.ep_size == 1
818
+ and is_triton_kernels_available()
819
+ ):
820
+ self.moe_runner_backend = "triton_kernel"
821
+ logger.warning(
822
+ "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
823
+ )
824
+ self.disable_hybrid_swa_memory = True
825
+ if is_mxfp4_quant_format:
826
+ # use bf16 for mxfp4 triton kernels
827
+ self.dtype = "bfloat16"
828
+
829
+ elif "Llama4" in model_arch and self.device != "cpu":
830
+ assert self.attention_backend in {
831
+ "fa3",
832
+ "aiter",
833
+ "triton",
834
+ }, "fa3, aiter, or triton is required for Llama4 model"
835
+ elif model_arch in [
836
+ "Gemma2ForCausalLM",
837
+ "Gemma3ForCausalLM",
838
+ "Gemma3ForConditionalGeneration",
839
+ "Gemma3nForCausalLM",
840
+ "Gemma3nForConditionalGeneration",
841
+ ]:
842
+ # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
843
+ # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
844
+ logger.warning(
845
+ f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
846
+ )
847
+ self.disable_hybrid_swa_memory = True
848
+
849
+ if is_deepseek_nsa(hf_config):
850
+ if (
851
+ self.attention_backend is None
852
+ and self.prefill_attention_backend is None
853
+ and self.decode_attention_backend is None
854
+ ):
855
+ self.attention_backend = "nsa"
856
+ logger.warning("Set nsa attention backend for DeepSeek NSA.")
857
+
858
+ if not is_npu():
859
+ self.enable_dp_attention = True
860
+ self.dp_size = self.tp_size
861
+ logger.warning("DP attention is enabled for DeepSeek NSA.")
862
+
863
+ self.page_size = 64
864
+ logger.warning("Setting page size to 64 for DeepSeek NSA.")
865
+
866
+ # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
867
+ import torch
868
+
869
+ major, _ = torch.cuda.get_device_capability()
870
+ if major >= 10:
871
+ self.kv_cache_dtype = "fp8_e4m3"
872
+ logger.warning("Setting KV cache dtype to fp8.")
873
+
874
+ if self.kv_cache_dtype == "fp8_e4m3":
875
+ self.nsa_prefill = "flashmla_decode"
876
+ self.nsa_decode = "flashmla_decode"
877
+ logger.warning(
878
+ "Setting NSA backend to flashmla_decode for FP8 KV Cache."
879
+ )
880
+
881
+ # Logging env vars for NSA
882
+ from sglang.srt.layers.attention.nsa.utils import (
883
+ print_nsa_bool_env_vars,
884
+ )
885
+
886
+ print_nsa_bool_env_vars()
887
+
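A condensed sketch of the GptOssForCausalLM backend precedence applied above; pick_gpt_oss_attention_backend and the sm parameter are illustrative stand-ins for the is_sm100_supported/is_sm90_supported checks, not SGLang APIs.

def pick_gpt_oss_attention_backend(is_cuda: bool, sm: int) -> str:
    # Mirrors the precedence above: trtllm_mha on SM100, fa3 on SM90, triton otherwise.
    if is_cuda and sm >= 100:
        return "trtllm_mha"
    if is_cuda and sm >= 90:
        return "fa3"
    return "triton"

assert pick_gpt_oss_attention_backend(True, 100) == "trtllm_mha"
assert pick_gpt_oss_attention_backend(True, 90) == "fa3"
assert pick_gpt_oss_attention_backend(False, 0) == "triton"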
888
+ def _handle_sampling_backend(self):
520
889
  if self.sampling_backend is None:
521
890
  self.sampling_backend = (
522
891
  "flashinfer" if is_flashinfer_available() else "pytorch"
523
892
  )
524
893
 
894
+ def _handle_attention_backend_compatibility(self):
525
895
  if self.attention_backend == "torch_native":
526
896
  logger.warning(
527
897
  "Cuda graph is disabled because of using torch native attention backend"
528
898
  )
529
899
  self.disable_cuda_graph = True
530
900
 
531
- if self.attention_backend == "ascend":
901
+ if self.attention_backend == "flex_attention":
902
+ logger.warning(
903
+ "Cuda graph is disabled because of using torch Flex Attention backend"
904
+ )
905
+ self.disable_cuda_graph = True
906
+ assert (
907
+ self.speculative_algorithm is None
908
+ ), "Speculative decoding is currently not supported with Flex Attention backend"
909
+
910
+ if is_npu() and self.attention_backend in ["ascend"]:
532
911
  logger.warning(
533
912
  "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
534
913
  )
@@ -590,30 +969,30 @@ class ServerArgs:
590
969
 
591
970
  if self.attention_backend == "dual_chunk_flash_attn":
592
971
  logger.warning(
593
- "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
972
+ "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
594
973
  )
595
974
  self.enable_mixed_chunk = False
596
- self.disable_cuda_graph = True
597
975
  self.disable_radix_cache = True
598
976
 
599
- # Set page size
977
+ def _handle_page_size(self):
600
978
  if self.page_size is None:
601
979
  self.page_size = 1
602
980
 
603
- # AMD-specific Triton attention KV splits default number
981
+ def _handle_amd_specifics(self):
604
982
  if is_hip():
605
983
  self.triton_attention_num_kv_splits = 16
606
984
 
607
- # Choose grammar backend
985
+ def _handle_grammar_backend(self):
608
986
  if self.grammar_backend is None:
609
987
  self.grammar_backend = "xgrammar"
610
988
 
611
- # Data parallelism attention
989
+ def _handle_data_parallelism(self):
990
+ if self.dp_size == 1:
991
+ self.enable_dp_attention = False
992
+ self.enable_dp_lm_head = False
993
+
612
994
  if self.enable_dp_attention:
613
995
  self.schedule_conservativeness = self.schedule_conservativeness * 0.3
614
- assert (
615
- self.dp_size > 1
616
- ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
617
996
  assert self.tp_size % self.dp_size == 0
618
997
  self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
619
998
  logger.warning(
@@ -625,7 +1004,7 @@ class ServerArgs:
625
1004
  self.enable_dp_attention
626
1005
  ), "Please enable dp attention when setting enable_dp_lm_head. "
627
1006
 
628
- # MoE kernel
1007
+ def _handle_moe_kernel_config(self):
629
1008
  if self.moe_runner_backend == "flashinfer_cutlass":
630
1009
  assert (
631
1010
  self.quantization == "modelopt_fp4"
@@ -636,13 +1015,15 @@ class ServerArgs:
636
1015
  ], "The expert parallel size must be 1 or the same as the tensor parallel size"
637
1016
 
638
1017
  if self.moe_runner_backend == "flashinfer_trtllm":
639
- if not self.disable_shared_experts_fusion:
640
- self.disable_shared_experts_fusion = True
641
- logger.warning(
642
- "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
643
- )
1018
+ assert (
1019
+ self.quantization == "modelopt_fp4" or self.quantization == "fp8"
1020
+ ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
1021
+ self.disable_shared_experts_fusion = True
1022
+ logger.warning(
1023
+ "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
1024
+ )
644
1025
 
645
- # DeepEP MoE
1026
+ def _handle_deepep_moe(self):
646
1027
  if self.moe_a2a_backend == "deepep":
647
1028
  if self.deepep_mode == "normal":
648
1029
  logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -652,6 +1033,7 @@ class ServerArgs:
652
1033
  f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
653
1034
  )
654
1035
 
1036
+ def _handle_eplb_and_dispatch(self):
655
1037
  if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
656
1038
  self.expert_distribution_recorder_mode = "stat"
657
1039
  logger.warning(
@@ -666,6 +1048,7 @@ class ServerArgs:
666
1048
  if self.enable_eplb:
667
1049
  assert self.ep_size > 1
668
1050
 
1051
+ def _handle_expert_distribution_metrics(self):
669
1052
  if self.enable_expert_distribution_metrics and (
670
1053
  self.expert_distribution_recorder_mode is None
671
1054
  ):
@@ -677,25 +1060,42 @@ class ServerArgs:
677
1060
  elif self.expert_distribution_recorder_mode is not None:
678
1061
  self.expert_distribution_recorder_buffer_size = 1000
679
1062
 
680
- # Pipeline parallelism
1063
+ def _handle_pipeline_parallelism(self):
681
1064
  if self.pp_size > 1:
682
1065
  self.disable_overlap_schedule = True
683
1066
  logger.warning(
684
1067
  "Pipeline parallelism is incompatible with overlap schedule."
685
1068
  )
686
1069
 
687
- # Hicache
1070
+ def _handle_hicache(self):
688
1071
  if self.hicache_storage_backend == "mooncake":
689
- # to use mooncake storage backend, the following conditions must be met:
690
- self.hicache_io_backend = "kernel"
691
- self.hicache_mem_layout = "page_first"
1072
+ if self.hicache_mem_layout == "layer_first":
1073
+ if self.hicache_io_backend == "direct":
1074
+ self.hicache_mem_layout = "page_first_direct"
1075
+ elif self.hicache_io_backend == "kernel":
1076
+ self.hicache_mem_layout = "page_first"
1077
+ logger.warning(
1078
+ f"Mooncake storage backend does not support layer_first layout, "
1079
+ f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
1080
+ )
1081
+
1082
+ if self.hicache_mem_layout == "page_first_direct":
1083
+ if self.hicache_io_backend != "direct":
1084
+ self.hicache_io_backend = "direct"
1085
+ logger.warning(
1086
+ "The page_first_direct layout only supports the direct io backend"
1087
+ )
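A minimal sketch of the mooncake layout fallback above, written as a pure function for illustration; it assumes the io backend is either "kernel" or "direct".

def resolve_mooncake_layout(mem_layout: str, io_backend: str):
    # layer_first is not supported by the mooncake backend; pick the page-first
    # variant that matches the configured io backend.
    if mem_layout == "layer_first":
        mem_layout = "page_first_direct" if io_backend == "direct" else "page_first"
    # page_first_direct in turn requires the direct io backend.
    if mem_layout == "page_first_direct":
        io_backend = "direct"
    return mem_layout, io_backend

assert resolve_mooncake_layout("layer_first", "kernel") == ("page_first", "kernel")
assert resolve_mooncake_layout("layer_first", "direct") == ("page_first_direct", "direct")
assert resolve_mooncake_layout("page_first_direct", "kernel") == ("page_first_direct", "direct")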
692
1088
 
693
- # Speculative Decoding
1089
+ def _handle_speculative_decoding(self):
694
1090
  if self.speculative_algorithm == "NEXTN":
695
- # NEXTN shares the same implementation of EAGLE
696
1091
  self.speculative_algorithm = "EAGLE"
697
1092
 
698
- if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
1093
+ if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
1094
+ if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
1095
+ # TODO: support dp attention for standalone speculative decoding
1096
+ raise ValueError(
1097
+ "Currently standalone speculative decoding does not support dp attention."
1098
+ )
699
1099
  if self.max_running_requests is None:
700
1100
  self.max_running_requests = 48
701
1101
  self.disable_overlap_schedule = True
@@ -711,8 +1111,13 @@ class ServerArgs:
711
1111
  )
712
1112
 
713
1113
  model_arch = self.get_hf_config().architectures[0]
714
- if model_arch in ["DeepseekV3ForCausalLM", "Glm4MoeForCausalLM"]:
715
- # Auto set draft_model_path DeepSeek-V3/R1
1114
+ if model_arch in [
1115
+ "DeepseekV32ForCausalLM",
1116
+ "DeepseekV3ForCausalLM",
1117
+ "Glm4MoeForCausalLM",
1118
+ "BailingMoeForCausalLM",
1119
+ "BailingMoeV2ForCausalLM",
1120
+ ]:
716
1121
  if self.speculative_draft_model_path is None:
717
1122
  self.speculative_draft_model_path = self.model_path
718
1123
  else:
@@ -720,7 +1125,6 @@ class ServerArgs:
720
1125
  "DeepSeek MTP does not require setting speculative_draft_model_path."
721
1126
  )
722
1127
 
723
- # Auto choose parameters
724
1128
  if self.speculative_num_steps is None:
725
1129
  assert (
726
1130
  self.speculative_eagle_topk is None
@@ -760,23 +1164,63 @@ class ServerArgs:
760
1164
  "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
761
1165
  )
762
1166
 
763
- # The token generated from the verify step is counted.
764
- # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
765
- # assert self.speculative_num_steps < self.speculative_num_draft_tokens
1167
+ if self.speculative_algorithm == "NGRAM":
1168
+ if not self.device.startswith("cuda"):
1169
+ raise ValueError(
1170
+ "Ngram speculative decoding only supports CUDA device."
1171
+ )
1172
+ if self.max_running_requests is None:
1173
+ self.max_running_requests = 48
1174
+ self.disable_overlap_schedule = True
1175
+ self.enable_mixed_chunk = False
1176
+ self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
1177
+ if self.speculative_num_draft_tokens is None:
1178
+ self.speculative_num_draft_tokens = (
1179
+ self.speculative_ngram_max_match_window_size
1180
+ )
1181
+ logger.warning(
1182
+ "The overlap scheduler and mixed chunked prefill are disabled because of "
1183
+ "using ngram speculative decoding."
1184
+ )
766
1185
 
767
- # GGUF
1186
+ if (
1187
+ self.speculative_eagle_topk > 1
1188
+ and self.page_size > 1
1189
+ and self.attention_backend != "flashinfer"
1190
+ ):
1191
+ raise ValueError(
1192
+ f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
1193
+ f"with page_size({self.page_size}) > 1 is unstable "
1194
+ "and produces incorrect results for paged attention backends. "
1195
+ "This combination is only supported for the 'flashinfer' backend."
1196
+ )
1197
+ if self.enable_dp_attention:
1198
+ # TODO: support dp attention for ngram speculative decoding
1199
+ raise ValueError(
1200
+ "Currently ngram speculative decoding does not support dp attention."
1201
+ )
1202
+
1203
+ def _handle_load_format(self):
768
1204
  if (
769
1205
  self.load_format == "auto" or self.load_format == "gguf"
770
1206
  ) and check_gguf_file(self.model_path):
771
1207
  self.quantization = self.load_format = "gguf"
772
1208
 
773
- # Model loading
774
1209
  if is_remote_url(self.model_path):
775
1210
  self.load_format = "remote"
1211
+
776
1212
  if self.custom_weight_loader is None:
777
1213
  self.custom_weight_loader = []
778
1214
 
779
- # PD disaggregation
1215
+ if self.load_format == "remote_instance":
1216
+ if (
1217
+ self.remote_instance_weight_loader_seed_instance_ip is None
1218
+ or self.remote_instance_weight_loader_seed_instance_service_port is None
1219
+ or self.remote_instance_weight_loader_send_weights_group_ports is None
1220
+ ):
1221
+ self.load_format = "auto"
1222
+
1223
+ def _handle_disaggregation(self):
780
1224
  if self.disaggregation_mode == "decode":
781
1225
  assert (
782
1226
  self.disaggregation_decode_tp is None
@@ -787,6 +1231,13 @@ class ServerArgs:
787
1231
 
788
1232
  self.disable_radix_cache = True
789
1233
  logger.warning("KV cache is forced as chunk cache for decode server")
1234
+
1235
+ if self.dp_size > 1 and not is_in_ci():
1236
+ assert self.prefill_round_robin_balance, (
1237
+ "Prefill round robin balance is required when dp size > 1. "
1238
+ "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
1239
+ " and `--prefill-round-robin-balance` is set for decode server."
1240
+ )
790
1241
  elif self.disaggregation_mode == "prefill":
791
1242
  if self.disaggregation_decode_tp is None:
792
1243
  self.disaggregation_decode_tp = self.tp_size
@@ -795,25 +1246,84 @@ class ServerArgs:
795
1246
 
796
1247
  self.disaggregation_prefill_pp = self.pp_size
797
1248
  self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
798
-
799
1249
  self.disable_cuda_graph = True
800
1250
  logger.warning("Cuda graph is disabled for prefill server")
801
1251
 
802
- # Propagate env vars
1252
+ def _handle_tokenizer_batching(self):
1253
+ if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
1254
+ raise ValueError(
1255
+ "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
1256
+ "Please choose one tokenizer batching approach."
1257
+ )
1258
+
1259
+ def _handle_environment_variables(self):
803
1260
  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
804
1261
  "1" if self.enable_torch_compile else "0"
805
1262
  )
806
- # Set env var before grammar backends init
1263
+ os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
807
1264
  os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
808
1265
  "1" if self.disable_outlines_disk_cache else "0"
809
1266
  )
1267
+ os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
1268
+ "1" if self.enable_deterministic_inference else "0"
1269
+ )
810
1270
 
1271
+ def _handle_cache_compatibility(self):
811
1272
  if self.enable_hierarchical_cache and self.disable_radix_cache:
812
1273
  raise ValueError(
813
1274
  "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
814
1275
  "and cannot be used at the same time. Please use only one of them."
815
1276
  )
816
1277
 
1278
+ if (
1279
+ self.disaggregation_decode_enable_offload_kvcache
1280
+ and self.disaggregation_mode != "decode"
1281
+ ):
1282
+ raise ValueError(
1283
+ "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
1284
+ )
1285
+
1286
+ def _handle_metrics_labels(self):
1287
+ if (
1288
+ not self.tokenizer_metrics_custom_labels_header
1289
+ and self.tokenizer_metrics_allowed_custom_labels
1290
+ ):
1291
+ raise ValueError(
1292
+ "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
1293
+ )
1294
+
1295
+ def _handle_deterministic_inference(self):
1296
+ if self.enable_deterministic_inference:
1297
+ # Check sampling backend
1298
+ self.sampling_backend = "pytorch"
1299
+ logger.warning(
1300
+ "Sampling backend is set to pytorch for deterministic inference."
1301
+ )
1302
+
1303
+ # Check attention backend
1304
+ if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
1305
+ raise ValueError(
1306
+ f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
1307
+ )
1308
+
1309
+ # Currently, only FA3 supports radix cache. Support for other backends is in progress
1310
+ if self.attention_backend != "fa3":
1311
+ self.disable_radix_cache = True
1312
+ logger.warning(
1313
+ f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
1314
+ )
1315
+
1316
+ # Check TP size
1317
+ if self.tp_size > 1:
1318
+ os.environ["NCCL_ALGO"] = "allreduce:tree"
1319
+ self.disable_custom_all_reduce = True
1320
+ logger.warning(
1321
+ "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
1322
+ )
1323
+
1324
+ def _handle_other_validations(self):
1325
+ pass
1326
+
817
1327
  @staticmethod
818
1328
  def add_cli_args(parser: argparse.ArgumentParser):
819
1329
  # Model and tokenizer
@@ -830,12 +1340,6 @@ class ServerArgs:
830
1340
  default=ServerArgs.tokenizer_path,
831
1341
  help="The path of the tokenizer.",
832
1342
  )
833
- parser.add_argument(
834
- "--tokenizer-worker-num",
835
- type=int,
836
- default=ServerArgs.tokenizer_worker_num,
837
- help="The worker num of the tokenizer manager.",
838
- )
839
1343
  parser.add_argument(
840
1344
  "--tokenizer-mode",
841
1345
  type=str,
@@ -845,6 +1349,12 @@ class ServerArgs:
845
1349
  "tokenizer if available, and 'slow' will "
846
1350
  "always use the slow tokenizer.",
847
1351
  )
1352
+ parser.add_argument(
1353
+ "--tokenizer-worker-num",
1354
+ type=int,
1355
+ default=ServerArgs.tokenizer_worker_num,
1356
+ help="The worker num of the tokenizer manager.",
1357
+ )
848
1358
  parser.add_argument(
849
1359
  "--skip-tokenizer-init",
850
1360
  action="store_true",
@@ -985,6 +1495,14 @@ class ServerArgs:
985
1495
  "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
986
1496
  "default to 1.0, which may cause accuracy issues. ",
987
1497
  )
1498
+ parser.add_argument(
1499
+ "--modelopt-quant",
1500
+ type=str,
1501
+ default=ServerArgs.modelopt_quant,
1502
+ help="The ModelOpt quantization configuration. "
1503
+ "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
1504
+ "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
1505
+ )
988
1506
  parser.add_argument(
989
1507
  "--kv-cache-dtype",
990
1508
  type=str,
@@ -992,6 +1510,11 @@ class ServerArgs:
992
1510
  choices=["auto", "fp8_e5m2", "fp8_e4m3"],
993
1511
  help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
994
1512
  )
1513
+ parser.add_argument(
1514
+ "--enable-fp32-lm-head",
1515
+ action="store_true",
1516
+ help="If set, the LM head outputs (logits) are in FP32.",
1517
+ )
995
1518
 
996
1519
  # Memory and scheduling
997
1520
  parser.add_argument(
@@ -1035,9 +1558,27 @@ class ServerArgs:
1035
1558
  "--schedule-policy",
1036
1559
  type=str,
1037
1560
  default=ServerArgs.schedule_policy,
1038
- choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
1561
+ choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
1039
1562
  help="The scheduling policy of the requests.",
1040
1563
  )
1564
+ parser.add_argument(
1565
+ "--enable-priority-scheduling",
1566
+ action="store_true",
1567
+ default=ServerArgs.enable_priority_scheduling,
1568
+ help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
1569
+ )
1570
+ parser.add_argument(
1571
+ "--schedule-low-priority-values-first",
1572
+ action="store_true",
1573
+ default=ServerArgs.schedule_low_priority_values_first,
1574
+ help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
1575
+ )
1576
+ parser.add_argument(
1577
+ "--priority-scheduling-preemption-threshold",
1578
+ type=int,
1579
+ default=ServerArgs.priority_scheduling_preemption_threshold,
1580
+ help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
1581
+ )
1041
1582
  parser.add_argument(
1042
1583
  "--schedule-conservativeness",
1043
1584
  type=float,
@@ -1097,9 +1638,9 @@ class ServerArgs:
1097
1638
  help="The pipeline parallelism size.",
1098
1639
  )
1099
1640
  parser.add_argument(
1100
- "--max-micro-batch-size",
1641
+ "--pp-max-micro-batch-size",
1101
1642
  type=int,
1102
- default=ServerArgs.max_micro_batch_size,
1643
+ default=ServerArgs.pp_max_micro_batch_size,
1103
1644
  help="The maximum micro batch size in pipeline parallelism.",
1104
1645
  )
1105
1646
  parser.add_argument(
@@ -1209,6 +1750,21 @@ class ServerArgs:
1209
1750
  "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
1210
1751
  "otherwise all metrics appear to come from TP 0.",
1211
1752
  )
1753
+ parser.add_argument(
1754
+ "--tokenizer-metrics-custom-labels-header",
1755
+ type=str,
1756
+ default=ServerArgs.tokenizer_metrics_custom_labels_header,
1757
+ help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
1758
+ )
1759
+ parser.add_argument(
1760
+ "--tokenizer-metrics-allowed-custom-labels",
1761
+ type=str,
1762
+ nargs="+",
1763
+ default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
1764
+ help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
1765
+ "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
1766
+ "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
1767
+ )
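A hedged usage sketch for these two flags; the header name x-custom-labels, the label names, and the localhost:30000 /generate endpoint are illustrative assumptions that must match how the server was actually launched (e.g. --tokenizer-metrics-custom-labels-header x-custom-labels --tokenizer-metrics-allowed-custom-labels label1 label2).

import json
import requests  # assumes the requests package is installed

payload = {"text": "Hello", "sampling_params": {"max_new_tokens": 8}}
headers = {"x-custom-labels": json.dumps({"label1": "value1", "label2": "value2"})}
resp = requests.post("http://localhost:30000/generate", json=payload, headers=headers)
print(resp.status_code)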
1212
1768
  parser.add_argument(
1213
1769
  "--bucket-time-to-first-token",
1214
1770
  type=float,
@@ -1239,8 +1795,8 @@ class ServerArgs:
1239
1795
  bucket_rule = (
1240
1796
  "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
1241
1797
  "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
1242
- "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
1243
- "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
1798
+ "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]); 'custom <value1> "
1799
+ "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
1244
1800
  )
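The 'tse' rule in the help text above can be reproduced with a short sketch; the formula is reconstructed from the example values ('tse 1000 2 8'), not taken from SGLang's implementation.

def tse_buckets(middle: float, base: float, count: int) -> list:
    # count/2 buckets on each side of the middle, spaced at base**1 .. base**(count/2).
    half = count // 2
    left = [middle - base**i for i in range(half, 0, -1)]
    right = [middle + base**i for i in range(1, half + 1)]
    return left + [middle] + right

print(tse_buckets(1000.0, 2.0, 8))
# [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]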
1245
1801
  parser.add_argument(
1246
1802
  "--prompt-tokens-buckets",
@@ -1280,6 +1836,17 @@ class ServerArgs:
1280
1836
  default=None,
1281
1837
  help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
1282
1838
  )
1839
+ parser.add_argument(
1840
+ "--enable-trace",
1841
+ action="store_true",
1842
+ help="Enable opentelemetry trace",
1843
+ )
1844
+ parser.add_argument(
1845
+ "--oltp-traces-endpoint",
1846
+ type=str,
1847
+ default="localhost:4317",
1848
+ help="The OpenTelemetry collector endpoint used when --enable-trace is set. Format: <ip>:<port>",
1849
+ )
1283
1850
 
1284
1851
  # API related
1285
1852
  parser.add_argument(
@@ -1338,6 +1905,16 @@ class ServerArgs:
1338
1905
  default=ServerArgs.tool_call_parser,
1339
1906
  help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
1340
1907
  )
1908
+ parser.add_argument(
1909
+ "--sampling-defaults",
1910
+ type=str,
1911
+ choices=["openai", "model"],
1912
+ default=ServerArgs.sampling_defaults,
1913
+ help="Where to get default sampling parameters. "
1914
+ "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
1915
+ "'model' uses the model's generation_config.json to get the recommended "
1916
+ "sampling parameters if available. Default is 'model'.",
1917
+ )
1341
1918
  parser.add_argument(
1342
1919
  "--tool-server",
1343
1920
  type=str,
@@ -1364,6 +1941,18 @@ class ServerArgs:
1364
1941
  "minimum_tokens",
1365
1942
  ],
1366
1943
  )
1944
+ parser.add_argument(
1945
+ "--load-watch-interval",
1946
+ type=float,
1947
+ default=ServerArgs.load_watch_interval,
1948
+ help="The interval of load watching in seconds.",
1949
+ )
1950
+ parser.add_argument(
1951
+ "--prefill-round-robin-balance",
1952
+ default=ServerArgs.prefill_round_robin_balance,
1953
+ action="store_true",
1954
+ help="Prefill is round-robin balanced. This is used to ensure the decode server can get the correct dp rank.",
1955
+ )
1367
1956
 
1368
1957
  # Multi-node distributed serving
1369
1958
  parser.add_argument(
@@ -1438,9 +2027,17 @@ class ServerArgs:
1438
2027
  parser.add_argument(
1439
2028
  "--lora-backend",
1440
2029
  type=str,
1441
- default="triton",
2030
+ choices=LORA_BACKEND_CHOICES,
2031
+ default=ServerArgs.lora_backend,
1442
2032
  help="Choose the kernel backend for multi-LoRA serving.",
1443
2033
  )
2034
+ parser.add_argument(
2035
+ "--max-lora-chunk-size",
2036
+ type=int,
2037
+ default=ServerArgs.max_lora_chunk_size,
2038
+ choices=[16, 32, 64, 128],
2039
+ help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
2040
+ )
1444
2041
 
1445
2042
  # Kernel backend
1446
2043
  parser.add_argument(
@@ -1474,30 +2071,51 @@ class ServerArgs:
1474
2071
  parser.add_argument(
1475
2072
  "--grammar-backend",
1476
2073
  type=str,
1477
- choices=["xgrammar", "outlines", "llguidance", "none"],
2074
+ choices=GRAMMAR_BACKEND_CHOICES,
1478
2075
  default=ServerArgs.grammar_backend,
1479
2076
  help="Choose the backend for grammar-guided decoding.",
1480
2077
  )
1481
2078
  parser.add_argument(
1482
2079
  "--mm-attention-backend",
1483
2080
  type=str,
1484
- choices=["sdpa", "fa3", "triton_attn"],
2081
+ choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
1485
2082
  default=ServerArgs.mm_attention_backend,
1486
2083
  help="Set multimodal attention backend.",
1487
2084
  )
2085
+ parser.add_argument(
2086
+ "--nsa-prefill",
2087
+ default=ServerArgs.nsa_prefill,
2088
+ type=str,
2089
+ choices=NSA_CHOICES,
2090
+ )
2091
+ parser.add_argument(
2092
+ "--nsa-decode",
2093
+ default=ServerArgs.nsa_decode,
2094
+ type=str,
2095
+ choices=NSA_CHOICES,
2096
+ )
1488
2097
 
1489
2098
  # Speculative decoding
1490
2099
  parser.add_argument(
1491
2100
  "--speculative-algorithm",
1492
2101
  type=str,
1493
- choices=["EAGLE", "EAGLE3", "NEXTN"],
2102
+ choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
1494
2103
  help="Speculative algorithm.",
1495
2104
  )
1496
2105
  parser.add_argument(
1497
2106
  "--speculative-draft-model-path",
2107
+ "--speculative-draft-model",
1498
2108
  type=str,
1499
2109
  help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
1500
2110
  )
2111
+ parser.add_argument(
2112
+ "--speculative-draft-model-revision",
2113
+ type=str,
2114
+ default=None,
2115
+ help="The specific draft model version to use. It can be a branch "
2116
+ "name, a tag name, or a commit id. If unspecified, will use "
2117
+ "the default version.",
2118
+ )
1501
2119
  parser.add_argument(
1502
2120
  "--speculative-num-steps",
1503
2121
  type=int,
@@ -1534,6 +2152,57 @@ class ServerArgs:
1534
2152
  help="The path of the draft model's small vocab table.",
1535
2153
  default=ServerArgs.speculative_token_map,
1536
2154
  )
2155
+ parser.add_argument(
2156
+ "--speculative-attention-mode",
2157
+ type=str,
2158
+ choices=["prefill", "decode"],
2159
+ help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
2160
+ default=ServerArgs.speculative_attention_mode,
2161
+ )
2162
+ # Ngram speculative decoding
2163
+ parser.add_argument(
2164
+ "--speculative-ngram-min-match-window-size",
2165
+ type=int,
2166
+ default=ServerArgs.speculative_ngram_min_match_window_size,
2167
+ help="The minimum window size for pattern matching in ngram speculative decoding.",
2168
+ )
2169
+ parser.add_argument(
2170
+ "--speculative-ngram-max-match-window-size",
2171
+ type=int,
2172
+ default=ServerArgs.speculative_ngram_max_match_window_size,
2173
+ help="The maximum window size for pattern matching in ngram speculative decoding.",
2174
+ )
2175
+ parser.add_argument(
2176
+ "--speculative-ngram-min-bfs-breadth",
2177
+ type=int,
2178
+ default=ServerArgs.speculative_ngram_min_bfs_breadth,
2179
+ help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2180
+ )
2181
+ parser.add_argument(
2182
+ "--speculative-ngram-max-bfs-breadth",
2183
+ type=int,
2184
+ default=ServerArgs.speculative_ngram_max_bfs_breadth,
2185
+ help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2186
+ )
2187
+ parser.add_argument(
2188
+ "--speculative-ngram-match-type",
2189
+ type=str,
2190
+ choices=["BFS", "PROB"],
2191
+ default=ServerArgs.speculative_ngram_match_type,
2192
+ help="The match type for cache tree.",
2193
+ )
2194
+ parser.add_argument(
2195
+ "--speculative-ngram-branch-length",
2196
+ type=int,
2197
+ default=ServerArgs.speculative_ngram_branch_length,
2198
+ help="The branch length for ngram speculative decoding.",
2199
+ )
2200
+ parser.add_argument(
2201
+ "--speculative-ngram-capacity",
2202
+ type=int,
2203
+ default=ServerArgs.speculative_ngram_capacity,
2204
+ help="The cache capacity for ngram speculative decoding.",
2205
+ )
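A condensed, illustrative view of how the NGRAM handler earlier in this file fills in defaults from these flags; the helper below is a sketch, not the real implementation.

def ngram_defaults(max_bfs_breadth: int, max_match_window_size: int) -> dict:
    # Mirrors _handle_speculative_decoding for NGRAM: topk tracks the BFS breadth,
    # and the draft-token budget defaults to the maximum match window size.
    return {
        "max_running_requests": 48,  # only applied when it was not set explicitly
        "disable_overlap_schedule": True,
        "enable_mixed_chunk": False,
        "speculative_eagle_topk": max_bfs_breadth,
        "speculative_num_draft_tokens": max_match_window_size,
    }

print(ngram_defaults(max_bfs_breadth=8, max_match_window_size=12))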
1537
2206
 
1538
2207
  # Expert parallelism
1539
2208
  parser.add_argument(
@@ -1554,21 +2223,14 @@ class ServerArgs:
1554
2223
  parser.add_argument(
1555
2224
  "--moe-runner-backend",
1556
2225
  type=str,
1557
- choices=[
1558
- "auto",
1559
- "triton",
1560
- "triton_kernel",
1561
- "flashinfer_trtllm",
1562
- "flashinfer_cutlass",
1563
- "flashinfer_mxfp4",
1564
- ],
2226
+ choices=MOE_RUNNER_BACKEND_CHOICES,
1565
2227
  default=ServerArgs.moe_runner_backend,
1566
2228
  help="Choose the runner backend for MoE.",
1567
2229
  )
1568
2230
  parser.add_argument(
1569
2231
  "--flashinfer-mxfp4-moe-precision",
1570
2232
  type=str,
1571
- choices=["mxfp4", "bf16"],
2233
+ choices=["default", "bf16"],
1572
2234
  default=ServerArgs.flashinfer_mxfp4_moe_precision,
1573
2235
  help="Choose the computation precision of flashinfer mxfp4 moe",
1574
2236
  )
@@ -1661,6 +2323,27 @@ class ServerArgs:
1661
2323
  help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
1662
2324
  )
1663
2325
 
2326
+ # Mamba Cache
2327
+ parser.add_argument(
2328
+ "--max-mamba-cache-size",
2329
+ type=int,
2330
+ default=ServerArgs.max_mamba_cache_size,
2331
+ help="The maximum size of the mamba cache.",
2332
+ )
2333
+ parser.add_argument(
2334
+ "--mamba-ssm-dtype",
2335
+ type=str,
2336
+ default=ServerArgs.mamba_ssm_dtype,
2337
+ choices=["float32", "bfloat16"],
2338
+ help="The data type of the SSM states in mamba cache.",
2339
+ )
2340
+ # Args for multi-item-scoring
2341
+ parser.add_argument(
2342
+ "--multi-item-scoring-delimiter",
2343
+ type=int,
2344
+ default=ServerArgs.multi_item_scoring_delimiter,
2345
+ help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
2346
+ )
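One plausible reading of the sequence layout in the help string, as a standalone sketch; the token IDs and helper name are made up.

def build_multi_item_sequence(query_ids, item_id_lists, delimiter_id):
    # Query<delimiter>Item1<delimiter>Item2<delimiter>...
    seq = list(query_ids)
    for item_ids in item_id_lists:
        seq.append(delimiter_id)
        seq.extend(item_ids)
    return seq

print(build_multi_item_sequence([101, 102], [[201, 202], [301]], delimiter_id=7))
# [101, 102, 7, 201, 202, 7, 301]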
1664
2347
  # Hierarchical cache
1665
2348
  parser.add_argument(
1666
2349
  "--enable-hierarchical-cache",
@@ -1686,6 +2369,13 @@ class ServerArgs:
1686
2369
  default=ServerArgs.hicache_write_policy,
1687
2370
  help="The write policy of hierarchical cache.",
1688
2371
  )
2372
+ parser.add_argument(
2373
+ "--radix-eviction-policy",
2374
+ type=str,
2375
+ choices=RADIX_EVICTION_POLICY_CHOICES,
2376
+ default=ServerArgs.radix_eviction_policy,
2377
+ help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
2378
+ )
1689
2379
  parser.add_argument(
1690
2380
  "--hicache-io-backend",
1691
2381
  type=str,
@@ -1696,16 +2386,19 @@ class ServerArgs:
1696
2386
  parser.add_argument(
1697
2387
  "--hicache-mem-layout",
1698
2388
  type=str,
1699
- choices=["layer_first", "page_first"],
2389
+ choices=["layer_first", "page_first", "page_first_direct"],
1700
2390
  default=ServerArgs.hicache_mem_layout,
1701
2391
  help="The layout of host memory pool for hierarchical cache.",
1702
2392
  )
1703
2393
  parser.add_argument(
1704
2394
  "--hicache-storage-backend",
1705
2395
  type=str,
1706
- choices=["file", "mooncake", "hf3fs", "nixl"],
2396
+ choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
1707
2397
  default=ServerArgs.hicache_storage_backend,
1708
- help="The storage backend for hierarchical KV cache.",
2398
+ help="The storage backend for hierarchical KV cache. "
2399
+ "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
2400
+ "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
2401
+ "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
1709
2402
  )
1710
2403
  parser.add_argument(
1711
2404
  "--hicache-storage-prefetch-policy",
@@ -1720,6 +2413,12 @@ class ServerArgs:
1720
2413
  default=ServerArgs.hicache_storage_backend_extra_config,
1721
2414
  help="A dictionary in JSON string format containing extra configuration for the storage backend.",
1722
2415
  )
2416
+ # LMCache
2417
+ parser.add_argument(
2418
+ "--enable-lmcache",
2419
+ action="store_true",
2420
+ help="Using LMCache as an alternative hierarchical cache solution",
2421
+ )
1723
2422
 
1724
2423
  # Double Sparsity
1725
2424
  parser.add_argument(
@@ -1863,6 +2562,11 @@ class ServerArgs:
1863
2562
  action="store_true",
1864
2563
  help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
1865
2564
  )
2565
+ parser.add_argument(
2566
+ "--enable-torch-symm-mem",
2567
+ action="store_true",
2568
+ help="Enable torch symmetric memory for the all-reduce kernel, falling back to NCCL. Only supported on CUDA devices SM90 and above. SM90 supports world sizes 4, 6, and 8; SM10 supports world sizes 6 and 8.",
2569
+ )
1866
2570
  parser.add_argument(
1867
2571
  "--disable-overlap-schedule",
1868
2572
  action="store_true",
@@ -1888,6 +2592,11 @@ class ServerArgs:
1888
2592
  action="store_true",
1889
2593
  help="Enabling two micro batches to overlap.",
1890
2594
  )
2595
+ parser.add_argument(
2596
+ "--enable-single-batch-overlap",
2597
+ action="store_true",
2598
+ help="Let computation and communication overlap within one micro batch.",
2599
+ )
1891
2600
  parser.add_argument(
1892
2601
  "--tbo-token-distribution-threshold",
1893
2602
  type=float,
@@ -1933,6 +2642,12 @@ class ServerArgs:
1933
2642
  default=ServerArgs.triton_attention_num_kv_splits,
1934
2643
  help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
1935
2644
  )
2645
+ parser.add_argument(
2646
+ "--triton-attention-split-tile-size",
2647
+ type=int,
2648
+ default=ServerArgs.triton_attention_split_tile_size,
2649
+ help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
2650
+ )
1936
2651
  parser.add_argument(
1937
2652
  "--num-continuous-decode-steps",
1938
2653
  type=int,
@@ -1951,6 +2666,11 @@ class ServerArgs:
1951
2666
  action="store_true",
1952
2667
  help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
1953
2668
  )
2669
+ parser.add_argument(
2670
+ "--enable-weights-cpu-backup",
2671
+ action="store_true",
2672
+ help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
2673
+ )
1954
2674
  parser.add_argument(
1955
2675
  "--allow-auto-truncate",
1956
2676
  action="store_true",
@@ -1981,6 +2701,11 @@ class ServerArgs:
1981
2701
  action="store_true",
1982
2702
  help="Adopt base image processor instead of fast image processor.",
1983
2703
  )
2704
+ parser.add_argument(
2705
+ "--keep-mm-feature-on-device",
2706
+ action="store_true",
2707
+ help="Keep multimodal feature tensors on device after processing to save D2H copy.",
2708
+ )
1984
2709
  parser.add_argument(
1985
2710
  "--enable-return-hidden-states",
1986
2711
  action="store_true",
@@ -1992,6 +2717,12 @@ class ServerArgs:
1992
2717
  default=ServerArgs.scheduler_recv_interval,
1993
2718
  help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
1994
2719
  )
2720
+ parser.add_argument(
2721
+ "--numa-node",
2722
+ type=int,
2723
+ nargs="+",
2724
+ help="Set the NUMA node for the subprocesses. The i-th element corresponds to the i-th subprocess.",
2725
+ )
1995
2726
 
1996
2727
  # Debug tensor dumps
1997
2728
  parser.add_argument(
@@ -2017,12 +2748,29 @@ class ServerArgs:
2017
2748
  action="store_true",
2018
2749
  help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
2019
2750
  )
2751
+ parser.add_argument(
2752
+ "--enable-dynamic-batch-tokenizer",
2753
+ action="store_true",
2754
+ help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
2755
+ )
2756
+ parser.add_argument(
2757
+ "--dynamic-batch-tokenizer-batch-size",
2758
+ type=int,
2759
+ default=ServerArgs.dynamic_batch_tokenizer_batch_size,
2760
+ help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
2761
+ )
2762
+ parser.add_argument(
2763
+ "--dynamic-batch-tokenizer-batch-timeout",
2764
+ type=float,
2765
+ default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
2766
+ help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
2767
+ )
2020
2768
 
2021
2769
  # PD disaggregation
2022
2770
  parser.add_argument(
2023
2771
  "--disaggregation-mode",
2024
2772
  type=str,
2025
- default="null",
2773
+ default=ServerArgs.disaggregation_mode,
2026
2774
  choices=["null", "prefill", "decode"],
2027
2775
  help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
2028
2776
  )
@@ -2065,6 +2813,11 @@ class ServerArgs:
2065
2813
  "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
2066
2814
  "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
2067
2815
  )
2816
+ parser.add_argument(
2817
+ "--disaggregation-decode-enable-offload-kvcache",
2818
+ action="store_true",
2819
+ help="Enable async KV cache offloading on decode server (PD mode).",
2820
+ )
2068
2821
  parser.add_argument(
2069
2822
  "--num-reserved-decode-tokens",
2070
2823
  type=int,
@@ -2072,10 +2825,10 @@ class ServerArgs:
2072
2825
  help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
2073
2826
  )
2074
2827
  parser.add_argument(
2075
- "--pdlb-url",
2076
- type=str,
2077
- default=None,
2078
- help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
2828
+ "--disaggregation-decode-polling-interval",
2829
+ type=int,
2830
+ default=ServerArgs.disaggregation_decode_polling_interval,
2831
+ help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
2079
2832
  )
2080
2833
 
2081
2834
  # Custom weight loader
@@ -2091,6 +2844,24 @@ class ServerArgs:
2091
2844
  action="store_true",
2092
2845
  help="Disable mmap while loading weight using safetensors.",
2093
2846
  )
2847
+ parser.add_argument(
2848
+ "--remote-instance-weight-loader-seed-instance-ip",
2849
+ type=str,
2850
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
2851
+ help="The ip of the seed instance for loading weights from remote instance.",
2852
+ )
2853
+ parser.add_argument(
2854
+ "--remote-instance-weight-loader-seed-instance-service-port",
2855
+ type=int,
2856
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
2857
+ help="The service port of the seed instance for loading weights from remote instance.",
2858
+ )
2859
+ parser.add_argument(
2860
+ "--remote-instance-weight-loader-send-weights-group-ports",
2861
+ type=json_list_type,
2862
+ default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
2863
+ help="The communication group ports for loading weights from remote instance.",
2864
+ )
2094
2865
 
2095
2866
  # For PD-Multiplexing
2096
2867
  parser.add_argument(
@@ -2106,36 +2877,55 @@ class ServerArgs:
2106
2877
  help="Number of sm partition groups.",
2107
2878
  )
2108
2879
 
2880
+ # For deterministic inference
2881
+ parser.add_argument(
2882
+ "--enable-deterministic-inference",
2883
+ action="store_true",
2884
+ help="Enable deterministic inference mode with batch invariant ops.",
2885
+ )
2886
+
2109
2887
  # Deprecated arguments
2110
2888
  parser.add_argument(
2111
2889
  "--enable-ep-moe",
2112
- action="store_true",
2113
- help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
2890
+ action=DeprecatedAction,
2891
+ help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
2114
2892
  )
2115
2893
  parser.add_argument(
2116
2894
  "--enable-deepep-moe",
2117
- action="store_true",
2118
- help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
2895
+ action=DeprecatedAction,
2896
+ help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
2119
2897
  )
2120
2898
  parser.add_argument(
2121
2899
  "--enable-flashinfer-cutlass-moe",
2122
- action="store_true",
2123
- help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
2900
+ action=DeprecatedAction,
2901
+ help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
2902
+ )
2903
+ parser.add_argument(
2904
+ "--enable-flashinfer-cutedsl-moe",
2905
+ action=DeprecatedAction,
2906
+ help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
2124
2907
  )
2125
2908
  parser.add_argument(
2126
2909
  "--enable-flashinfer-trtllm-moe",
2127
- action="store_true",
2128
- help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
2910
+ action=DeprecatedAction,
2911
+ help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
2129
2912
  )
2130
2913
  parser.add_argument(
2131
2914
  "--enable-triton-kernel-moe",
2132
- action="store_true",
2133
- help="(Deprecated) Use triton moe grouped gemm kernel.",
2915
+ action=DeprecatedAction,
2916
+ help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
2134
2917
  )
2135
2918
  parser.add_argument(
2136
2919
  "--enable-flashinfer-mxfp4-moe",
2137
- action="store_true",
2138
- help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
2920
+ action=DeprecatedAction,
2921
+ help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
2922
+ )
2923
+
2924
+ # Configuration file support
2925
+ parser.add_argument(
2926
+ "--config",
2927
+ type=str,
2928
+ help="Read CLI options from a config file. Must be a YAML file with configuration options.",
2139
2929
  )
2140
2930
 
2141
2931
  @classmethod
@@ -2144,6 +2934,7 @@ class ServerArgs:
2144
2934
  args.pp_size = args.pipeline_parallel_size
2145
2935
  args.dp_size = args.data_parallel_size
2146
2936
  args.ep_size = args.expert_parallel_size
2937
+
2147
2938
  attrs = [attr.name for attr in dataclasses.fields(cls)]
2148
2939
  return cls(**{attr: getattr(args, attr) for attr in attrs})
2149
2940
 
@@ -2200,7 +2991,8 @@ class ServerArgs:
2200
2991
 
2201
2992
  # Check chunked prefill
2202
2993
  # Skip validation if chunked prefill is disabled (i.e., size <= 0).
2203
- if self.chunked_prefill_size > 0:
2994
+ # Skip validation if disaggregation mode is decode.
2995
+ if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
2204
2996
  assert (
2205
2997
  self.chunked_prefill_size % self.page_size == 0
2206
2998
  ), "chunked_prefill_size must be divisible by page_size"
@@ -2214,6 +3006,24 @@ class ServerArgs:
2214
3006
  "--generation-tokens-buckets", self.generation_tokens_buckets
2215
3007
  )
2216
3008
 
3009
+ # Check scheduling policy
3010
+ if self.enable_priority_scheduling:
3011
+ assert self.schedule_policy in [
3012
+ "fcfs",
3013
+ "lof",
3014
+ ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
3015
+
3016
+ # Check multi-item scoring
3017
+ if self.multi_item_scoring_delimiter is not None:
3018
+ assert self.disable_radix_cache, (
3019
+ "Multi-item scoring requires radix cache to be disabled. "
3020
+ "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
3021
+ )
3022
+ assert self.chunked_prefill_size == -1, (
3023
+ "Multi-item scoring requires chunked prefill to be disabled. "
3024
+ "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
3025
+ )
3026
+
2217
3027
  def check_lora_server_args(self):
2218
3028
  assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
2219
3029
 
@@ -2297,6 +3107,12 @@ class ServerArgs:
2297
3107
  f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
2298
3108
  )
2299
3109
 
3110
+ if self.max_lora_chunk_size is not None:
3111
+ assert (
3112
+ 16 <= self.max_lora_chunk_size <= 128
3113
+ and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
3114
+ ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
3115
+
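A standalone illustration of the power-of-two bit trick used in this check.

def is_valid_lora_chunk_size(n: int) -> bool:
    # n & (n - 1) == 0 holds exactly for powers of two (for positive n).
    return 16 <= n <= 128 and (n & (n - 1)) == 0

print([n for n in (8, 16, 24, 32, 64, 128, 256) if is_valid_lora_chunk_size(n)])
# [16, 32, 64, 128]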
2300
3116
  def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
2301
3117
  larger_tp = max(decode_tp, prefill_tp)
2302
3118
  smaller_tp = min(decode_tp, prefill_tp)
@@ -2314,8 +3130,8 @@ class ServerArgs:
2314
3130
  assert rule in [
2315
3131
  "tse",
2316
3132
  "default",
2317
- "customer",
2318
- ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
3133
+ "custom",
3134
+ ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
2319
3135
 
2320
3136
  if rule == "tse":
2321
3137
  assert (
@@ -2338,95 +3154,20 @@ class ServerArgs:
2338
3154
  len(buckets_rule) == 1
2339
3155
  ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
2340
3156
 
2341
- elif rule == "customer":
3157
+ elif rule == "custom":
2342
3158
  assert (
2343
3159
  len(buckets_rule) >= 2
2344
- ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
3160
+ ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
2345
3161
  try:
2346
3162
  bucket_values = [float(x) for x in buckets_rule[1:]]
2347
3163
  except ValueError:
2348
- assert False, f"{arg_name} customer rule bucket values must be numeric"
3164
+ assert False, f"{arg_name} custom rule bucket values must be numeric"
2349
3165
  assert len(set(bucket_values)) == len(
2350
3166
  bucket_values
2351
- ), f"{arg_name} customer rule bucket values should not contain duplicates"
3167
+ ), f"{arg_name} custom rule bucket values should not contain duplicates"
2352
3168
  assert all(
2353
3169
  val >= 0 for val in bucket_values
2354
- ), f"{arg_name} customer rule bucket values should be non-negative"
2355
-
2356
- def model_specific_adjustments(self):
2357
- hf_config = self.get_hf_config()
2358
- model_arch = hf_config.architectures[0]
2359
- if model_arch in ["GptOssForCausalLM"]:
2360
- if self.attention_backend is None:
2361
- if is_cuda() and is_sm100_supported():
2362
- self.attention_backend = "trtllm_mha"
2363
- elif is_cuda() and is_sm90_supported():
2364
- self.attention_backend = "fa3"
2365
- else:
2366
- self.attention_backend = "triton"
2367
- supported_backends = ["triton", "trtllm_mha", "fa3"]
2368
- logger.info(
2369
- f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
2370
- )
2371
- assert (
2372
- self.attention_backend in supported_backends
2373
- ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
2374
-
2375
- if is_sm100_supported():
2376
- if not self.enable_dp_attention:
2377
- self.enable_flashinfer_allreduce_fusion = True
2378
- logger.info(
2379
- "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
2380
- )
2381
- quantization_config = getattr(hf_config, "quantization_config", None)
2382
- is_mxfp4_quant_format = (
2383
- quantization_config is not None
2384
- and quantization_config.get("quant_method") == "mxfp4"
2385
- )
2386
-
2387
- if is_sm100_supported() and is_mxfp4_quant_format:
2388
- self.moe_runner_backend = "flashinfer_mxfp4"
2389
- logger.warning(
2390
- "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
2391
- )
2392
- else:
2393
- if self.moe_runner_backend == "triton_kernel":
2394
- assert (
2395
- self.ep_size == 1
2396
- ), "Triton kernel MoE is only supported when ep_size == 1"
2397
- if (
2398
- self.moe_runner_backend == "auto"
2399
- and self.ep_size == 1
2400
- and is_triton_kernels_available()
2401
- ):
2402
- self.moe_runner_backend = "triton_kernel"
2403
- logger.warning(
2404
- "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
2405
- )
2406
- self.disable_hybrid_swa_memory = True
2407
- if is_mxfp4_quant_format:
2408
- # use bf16 for mxfp4 triton kernels
2409
- self.dtype = "bfloat16"
2410
-
2411
- elif "Llama4" in model_arch:
2412
- assert self.attention_backend in {
2413
- "fa3",
2414
- "aiter",
2415
- "triton",
2416
- }, "fa3, aiter, or triton is required for Llama4 model"
2417
- elif model_arch in [
2418
- "Gemma2ForCausalLM",
2419
- "Gemma3ForCausalLM",
2420
- "Gemma3ForConditionalGeneration",
2421
- "Gemma3nForCausalLM",
2422
- "Gemma3nForConditionalGeneration",
2423
- ]:
2424
- # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
2425
- # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
2426
- logger.warning(
2427
- f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
2428
- )
2429
- self.disable_hybrid_swa_memory = True
3170
+ ), f"{arg_name} custom rule bucket values should be non-negative"
2430
3171
 
2431
3172
  def adjust_mem_fraction_for_vlm(self, model_config):
2432
3173
  vision_config = getattr(model_config.hf_config, "vision_config", None)
@@ -2478,6 +3219,26 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
2478
3219
  Returns:
2479
3220
  The server arguments.
2480
3221
  """
3222
+ # Import here to avoid circular imports
3223
+ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
3224
+
3225
+ # Check for config file and merge arguments if present
3226
+ if "--config" in argv:
3227
+ # Extract boolean actions from the parser to handle them correctly
3228
+ parser = argparse.ArgumentParser()
3229
+ ServerArgs.add_cli_args(parser)
3230
+
3231
+ # Get boolean action destinations
3232
+ boolean_actions = []
3233
+ for action in parser._actions:
3234
+ if hasattr(action, "dest") and hasattr(action, "action"):
3235
+ if action.action in ["store_true", "store_false"]:
3236
+ boolean_actions.append(action.dest)
3237
+
3238
+ # Merge config file arguments with CLI arguments
3239
+ config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
3240
+ argv = config_merger.merge_config_with_args(argv)
3241
+
2481
3242
  parser = argparse.ArgumentParser()
2482
3243
  ServerArgs.add_cli_args(parser)
2483
3244
  raw_args = parser.parse_args(argv)
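A hedged end-to-end sketch of the --config path added above; the YAML key style (dashed flag names without the leading '--'), the module path, and the model name are assumptions and may need adjusting to what ConfigArgumentMerger actually expects.

import pathlib

from sglang.srt.server_args import prepare_server_args  # assumed module path for this file

cfg = pathlib.Path("server.yaml")
cfg.write_text(
    "model-path: meta-llama/Llama-3.1-8B-Instruct\n"
    "tp-size: 2\n"
    "enable-torch-compile: true\n"
)

# Flags given alongside --config on the command line are expected to be merged in as well.
server_args = prepare_server_args(["--config", str(cfg), "--port", "30001"])
print(server_args.tp_size, server_args.port)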
@@ -2612,14 +3373,19 @@ def auto_choose_speculative_params(self: ServerArgs):
2612
3373
  """
2613
3374
  hf_config = self.get_hf_config()
2614
3375
  arch = hf_config.architectures[0]
2615
-
3376
+ if self.speculative_algorithm == "STANDALONE":
3377
+ # The default value for standalone speculative decoding
3378
+ return (3, 1, 4)
2616
3379
  if arch in ["LlamaForCausalLM"]:
2617
3380
  # The default value for llama
2618
3381
  return (5, 4, 8)
2619
3382
  elif arch in [
3383
+ "DeepseekV32ForCausalLM",
2620
3384
  "DeepseekV3ForCausalLM",
2621
3385
  "DeepseekV2ForCausalLM",
2622
3386
  "GptOssForCausalLM",
3387
+ "BailingMoeForCausalLM",
3388
+ "BailingMoeV2ForCausalLM",
2623
3389
  ]:
2624
3390
  # The default value for deepseek and gpt-oss
2625
3391
  return (3, 1, 4)