sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff shows the changes between two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (395)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/lang/interpreter.py +1 -1
  7. sglang/launch_server.py +14 -0
  8. sglang/profiler.py +2 -2
  9. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  10. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  11. sglang/srt/configs/__init__.py +8 -0
  12. sglang/srt/configs/device_config.py +3 -1
  13. sglang/srt/configs/dots_ocr.py +64 -0
  14. sglang/srt/configs/dots_vlm.py +139 -0
  15. sglang/srt/configs/falcon_h1.py +360 -0
  16. sglang/srt/configs/internvl.py +6 -0
  17. sglang/srt/configs/load_config.py +9 -0
  18. sglang/srt/configs/model_config.py +181 -82
  19. sglang/srt/configs/qwen3_next.py +326 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +71 -19
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +326 -53
  44. sglang/srt/disaggregation/prefill.py +36 -17
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +192 -113
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  52. sglang/srt/entrypoints/grpc_server.py +810 -0
  53. sglang/srt/entrypoints/http_server.py +132 -57
  54. sglang/srt/entrypoints/openai/protocol.py +115 -7
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +207 -58
  57. sglang/srt/entrypoints/openai/serving_completions.py +17 -4
  58. sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +49 -4
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/environ.py +285 -0
  63. sglang/srt/eplb/eplb_manager.py +2 -2
  64. sglang/srt/eplb/expert_distribution.py +26 -13
  65. sglang/srt/eplb/expert_location.py +38 -8
  66. sglang/srt/eplb/expert_location_updater.py +1 -1
  67. sglang/srt/function_call/base_format_detector.py +3 -6
  68. sglang/srt/function_call/ebnf_composer.py +11 -9
  69. sglang/srt/function_call/function_call_parser.py +9 -2
  70. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  71. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  72. sglang/srt/function_call/json_array_parser.py +63 -0
  73. sglang/srt/function_call/kimik2_detector.py +17 -4
  74. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  75. sglang/srt/function_call/utils.py +96 -5
  76. sglang/srt/grpc/__init__.py +1 -0
  77. sglang/srt/grpc/compile_proto.py +245 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  81. sglang/srt/layers/activation.py +143 -9
  82. sglang/srt/layers/attention/aiter_backend.py +106 -82
  83. sglang/srt/layers/attention/ascend_backend.py +115 -9
  84. sglang/srt/layers/attention/attention_registry.py +206 -0
  85. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  86. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  87. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  88. sglang/srt/layers/attention/fla/chunk.py +242 -0
  89. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  90. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  91. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  92. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  93. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  94. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  95. sglang/srt/layers/attention/fla/index.py +37 -0
  96. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  97. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  98. sglang/srt/layers/attention/fla/op.py +66 -0
  99. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  100. sglang/srt/layers/attention/fla/utils.py +331 -0
  101. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  102. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  103. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  104. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  105. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  106. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  107. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  108. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  109. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  111. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  112. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  113. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  114. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  115. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  121. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  122. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  123. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  124. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  125. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  126. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  127. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  128. sglang/srt/layers/attention/nsa/utils.py +24 -0
  129. sglang/srt/layers/attention/nsa_backend.py +887 -0
  130. sglang/srt/layers/attention/tbo_backend.py +6 -6
  131. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  132. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  133. sglang/srt/layers/attention/triton_backend.py +57 -7
  134. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  135. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  136. sglang/srt/layers/attention/vision.py +58 -0
  137. sglang/srt/layers/attention/wave_backend.py +4 -4
  138. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  139. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  140. sglang/srt/layers/communicator.py +53 -7
  141. sglang/srt/layers/dp_attention.py +41 -2
  142. sglang/srt/layers/elementwise.py +3 -1
  143. sglang/srt/layers/layernorm.py +34 -15
  144. sglang/srt/layers/linear.py +55 -7
  145. sglang/srt/layers/logits_processor.py +44 -12
  146. sglang/srt/layers/moe/__init__.py +2 -1
  147. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  148. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  149. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  150. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  151. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  167. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  169. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  170. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  171. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  172. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  173. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  174. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  175. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  176. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  177. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  178. sglang/srt/layers/moe/topk.py +30 -9
  179. sglang/srt/layers/moe/utils.py +22 -7
  180. sglang/srt/layers/parameter.py +23 -6
  181. sglang/srt/layers/quantization/awq.py +19 -7
  182. sglang/srt/layers/quantization/base_config.py +11 -6
  183. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  184. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  185. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  186. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  187. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  188. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  189. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  190. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  191. sglang/srt/layers/quantization/fp8.py +78 -49
  192. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  193. sglang/srt/layers/quantization/gptq.py +25 -17
  194. sglang/srt/layers/quantization/modelopt_quant.py +225 -57
  195. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  196. sglang/srt/layers/quantization/mxfp4.py +77 -42
  197. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  199. sglang/srt/layers/quantization/quark/utils.py +97 -0
  200. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  201. sglang/srt/layers/quantization/unquant.py +135 -47
  202. sglang/srt/layers/quantization/w4afp8.py +26 -17
  203. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  204. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  205. sglang/srt/layers/rocm_linear_utils.py +44 -0
  206. sglang/srt/layers/rotary_embedding.py +78 -49
  207. sglang/srt/layers/sampler.py +213 -21
  208. sglang/srt/layers/utils.py +23 -0
  209. sglang/srt/lora/backend/base_backend.py +50 -8
  210. sglang/srt/lora/backend/chunked_backend.py +348 -0
  211. sglang/srt/lora/backend/triton_backend.py +99 -5
  212. sglang/srt/lora/layers.py +32 -0
  213. sglang/srt/lora/lora.py +8 -3
  214. sglang/srt/lora/lora_manager.py +52 -118
  215. sglang/srt/lora/mem_pool.py +25 -11
  216. sglang/srt/lora/triton_ops/__init__.py +4 -0
  217. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  219. sglang/srt/lora/utils.py +22 -11
  220. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  221. sglang/srt/managers/cache_controller.py +215 -314
  222. sglang/srt/managers/data_parallel_controller.py +115 -80
  223. sglang/srt/managers/detokenizer_manager.py +19 -15
  224. sglang/srt/managers/disagg_service.py +46 -0
  225. sglang/srt/managers/io_struct.py +340 -109
  226. sglang/srt/managers/mm_utils.py +44 -6
  227. sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
  228. sglang/srt/managers/multimodal_processor.py +1 -2
  229. sglang/srt/managers/overlap_utils.py +53 -0
  230. sglang/srt/managers/schedule_batch.py +240 -138
  231. sglang/srt/managers/schedule_policy.py +147 -19
  232. sglang/srt/managers/scheduler.py +501 -304
  233. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  234. sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
  235. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  236. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  237. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  238. sglang/srt/managers/template_manager.py +3 -3
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +321 -632
  241. sglang/srt/managers/tp_worker.py +81 -22
  242. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  243. sglang/srt/managers/utils.py +1 -45
  244. sglang/srt/mem_cache/allocator.py +15 -21
  245. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  246. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  247. sglang/srt/mem_cache/chunk_cache.py +8 -1
  248. sglang/srt/mem_cache/evict_policy.py +23 -0
  249. sglang/srt/mem_cache/hicache_storage.py +58 -34
  250. sglang/srt/mem_cache/hiradix_cache.py +227 -80
  251. sglang/srt/mem_cache/memory_pool.py +535 -58
  252. sglang/srt/mem_cache/memory_pool_host.py +239 -223
  253. sglang/srt/mem_cache/radix_cache.py +222 -73
  254. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  255. sglang/srt/mem_cache/storage/__init__.py +10 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  257. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  258. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  259. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  260. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  261. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  262. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  263. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
  264. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  265. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  266. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
  267. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  268. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  269. sglang/srt/metrics/collector.py +519 -132
  270. sglang/srt/metrics/func_timer.py +2 -7
  271. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  272. sglang/srt/metrics/utils.py +55 -0
  273. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  274. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  275. sglang/srt/model_executor/forward_batch_info.py +98 -57
  276. sglang/srt/model_executor/model_runner.py +433 -158
  277. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  278. sglang/srt/model_loader/__init__.py +9 -3
  279. sglang/srt/model_loader/loader.py +133 -5
  280. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  281. sglang/srt/model_loader/weight_utils.py +158 -3
  282. sglang/srt/models/apertus.py +686 -0
  283. sglang/srt/models/bailing_moe.py +820 -217
  284. sglang/srt/models/bailing_moe_nextn.py +168 -0
  285. sglang/srt/models/deepseek_nextn.py +6 -1
  286. sglang/srt/models/deepseek_v2.py +833 -152
  287. sglang/srt/models/dots_ocr.py +173 -0
  288. sglang/srt/models/dots_vlm.py +174 -0
  289. sglang/srt/models/dots_vlm_vit.py +337 -0
  290. sglang/srt/models/ernie4.py +1 -1
  291. sglang/srt/models/falcon_h1.py +576 -0
  292. sglang/srt/models/gemma3_causal.py +0 -2
  293. sglang/srt/models/gemma3_mm.py +1 -1
  294. sglang/srt/models/gemma3n_mm.py +2 -2
  295. sglang/srt/models/glm4_moe.py +14 -5
  296. sglang/srt/models/glm4_moe_nextn.py +2 -2
  297. sglang/srt/models/glm4v.py +5 -3
  298. sglang/srt/models/glm4v_moe.py +4 -1
  299. sglang/srt/models/gpt_oss.py +8 -31
  300. sglang/srt/models/internvl.py +28 -0
  301. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  302. sglang/srt/models/llama.py +4 -0
  303. sglang/srt/models/llama4.py +9 -0
  304. sglang/srt/models/llama_eagle3.py +13 -0
  305. sglang/srt/models/longcat_flash.py +3 -3
  306. sglang/srt/models/longcat_flash_nextn.py +1 -1
  307. sglang/srt/models/minicpmv.py +165 -3
  308. sglang/srt/models/mllama4.py +40 -4
  309. sglang/srt/models/opt.py +637 -0
  310. sglang/srt/models/qwen2_5_vl.py +29 -5
  311. sglang/srt/models/qwen2_audio.py +1 -1
  312. sglang/srt/models/qwen2_moe.py +124 -14
  313. sglang/srt/models/qwen2_vl.py +1 -1
  314. sglang/srt/models/qwen3.py +26 -5
  315. sglang/srt/models/qwen3_moe.py +71 -12
  316. sglang/srt/models/qwen3_next.py +1069 -0
  317. sglang/srt/models/qwen3_next_mtp.py +112 -0
  318. sglang/srt/models/qwen3_vl.py +787 -0
  319. sglang/srt/models/qwen3_vl_moe.py +471 -0
  320. sglang/srt/models/registry.py +15 -3
  321. sglang/srt/models/sarashina2_vision.py +269 -0
  322. sglang/srt/models/solar.py +505 -0
  323. sglang/srt/models/starcoder2.py +357 -0
  324. sglang/srt/models/step3_vl.py +1 -1
  325. sglang/srt/models/torch_native_llama.py +10 -3
  326. sglang/srt/models/utils.py +51 -0
  327. sglang/srt/multimodal/processors/base_processor.py +15 -7
  328. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  329. sglang/srt/multimodal/processors/glm4v.py +9 -9
  330. sglang/srt/multimodal/processors/internvl.py +153 -129
  331. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  332. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  333. sglang/srt/offloader.py +27 -3
  334. sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
  335. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  336. sglang/srt/sampling/sampling_batch_info.py +38 -17
  337. sglang/srt/sampling/sampling_params.py +7 -0
  338. sglang/srt/server_args.py +1030 -254
  339. sglang/srt/server_args_config_parser.py +146 -0
  340. sglang/srt/single_batch_overlap.py +151 -0
  341. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  342. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  343. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  344. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  345. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  346. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  347. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  348. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  349. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  350. sglang/srt/speculative/eagle_worker.py +253 -136
  351. sglang/srt/speculative/ngram_utils.py +428 -0
  352. sglang/srt/speculative/ngram_worker.py +245 -0
  353. sglang/srt/speculative/spec_info.py +52 -0
  354. sglang/srt/speculative/spec_utils.py +606 -0
  355. sglang/srt/speculative/standalone_worker.py +109 -0
  356. sglang/srt/torch_memory_saver_adapter.py +5 -7
  357. sglang/srt/tracing/trace.py +578 -0
  358. sglang/srt/two_batch_overlap.py +8 -5
  359. sglang/srt/utils/__init__.py +2 -0
  360. sglang/srt/{utils.py → utils/common.py} +445 -77
  361. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  362. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  363. sglang/srt/utils/rpd_utils.py +452 -0
  364. sglang/srt/utils/slow_rank_detector.py +71 -0
  365. sglang/srt/warmup.py +8 -4
  366. sglang/srt/weight_sync/utils.py +2 -2
  367. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  368. sglang/test/few_shot_gsm8k.py +1 -0
  369. sglang/test/get_logits_ut.py +57 -0
  370. sglang/test/run_eval.py +79 -11
  371. sglang/test/runners.py +5 -1
  372. sglang/test/simple_eval_common.py +5 -2
  373. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  374. sglang/test/test_block_fp8.py +2 -2
  375. sglang/test/test_cutlass_moe.py +24 -6
  376. sglang/test/test_deterministic.py +297 -0
  377. sglang/test/test_disaggregation_utils.py +77 -0
  378. sglang/test/test_fp4_moe.py +370 -1
  379. sglang/test/test_programs.py +1 -1
  380. sglang/test/test_utils.py +383 -5
  381. sglang/utils.py +22 -1
  382. sglang/version.py +1 -1
  383. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  384. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
  385. sglang/srt/disaggregation/launch_lb.py +0 -118
  386. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  387. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  388. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  389. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  390. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  391. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  392. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  393. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  394. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  395. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -19,14 +19,13 @@ import json
 import logging
 import os
 import random
-import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
+from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
@@ -36,18 +35,22 @@ from sglang.srt.utils import (
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
+    json_list_type,
     nullable_str,
+    parse_connector_type,
 )
+from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
-
 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
@@ -60,6 +63,7 @@ LOAD_FORMAT_CHOICES = [
     "bitsandbytes",
     "layered",
     "remote",
+    "remote_instance",
 ]
 
 QUANTIZATION_CHOICES = [
@@ -86,9 +90,12 @@ ATTENTION_BACKEND_CHOICES = [
     # Common
     "triton",
     "torch_native",
+    "flex_attention",
+    "nsa",
     # NVIDIA specific
     "cutlass_mla",
     "fa3",
+    "fa4",
     "flashinfer",
     "flashmla",
     "trtllm_mla",
@@ -102,8 +109,18 @@ ATTENTION_BACKEND_CHOICES = [
     "ascend",
 ]
 
+LORA_BACKEND_CHOICES = ["triton", "csgmv"]
+
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
+
+NSA_CHOICES = ["flashmla_prefill", "flashmla_decode", "fa3", "tilelang", "aiter"]
+
+RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -122,6 +139,18 @@ def add_disagg_transfer_backend_choices(choices):
     DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
 
 
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
+def add_deterministic_attention_backend_choices(choices):
+    DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_radix_eviction_policy_choices(choices):
+    RADIX_EVICTION_POLICY_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
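Note on the registration hooks above: they exist so that out-of-tree integrations can extend the accepted choice lists before ServerArgs (or the CLI parser built from it) validates user input. A minimal sketch, assuming a hypothetical plugin that ships its own grammar backend and eviction policy (the names "my_grammar" and "my_policy" are placeholders, not real backends):

    from sglang.srt import server_args

    # Must run before arguments are parsed/validated against the *_CHOICES lists.
    server_args.add_grammar_backend_choices(["my_grammar"])
    server_args.add_radix_eviction_policy_choices(["my_policy"])

    print(server_args.GRAMMAR_BACKEND_CHOICES)
    # ['xgrammar', 'outlines', 'llguidance', 'none', 'my_grammar']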
@@ -151,20 +180,25 @@ class ServerArgs:
     quantization: Optional[str] = None
     quantization_param_path: Optional[str] = None
     kv_cache_dtype: str = "auto"
+    enable_fp32_lm_head: bool = False
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
-    max_queued_requests: Optional[int] = sys.maxsize
+    max_queued_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
+    enable_priority_scheduling: bool = False
+    schedule_low_priority_values_first: bool = False
+    priority_scheduling_preemption_threshold: int = 10
     schedule_conservativeness: float = 1.0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
+    radix_eviction_policy: str = "lru"
 
     # Runtime options
     device: Optional[str] = None
@@ -191,14 +225,20 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
+    tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
+    tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
     gc_warning_threshold_secs: float = 0.0
+    enable_trace: bool = False
+    oltp_traces_endpoint: str = "localhost:4317"
 
     # API related
     api_key: Optional[str] = None
@@ -215,6 +255,9 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    load_watch_interval: float = 0.1
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -235,6 +278,7 @@ class ServerArgs:
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
+    max_lora_chunk_size: Optional[int] = 16
 
     # Kernel backend
     attention_backend: Optional[str] = None
@@ -243,16 +287,28 @@ class ServerArgs:
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
+    nsa_prefill: str = "flashmla_prefill"
+    nsa_decode: str = "fa3"
 
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
+    # For ngram only
+    speculative_ngram_min_match_window_size: int = 1
+    speculative_ngram_max_match_window_size: int = 12
+    speculative_ngram_min_bfs_breadth: int = 1
+    speculative_ngram_max_bfs_breadth: int = 10
+    speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
+    speculative_ngram_branch_length: int = 18
+    speculative_ngram_capacity: int = 10 * 1000 * 1000
 
     # Expert parallelism
     ep_size: int = 1
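The new speculative_ngram_* knobs are ordinary dataclass fields, so they can also be set when constructing ServerArgs directly from Python. A minimal sketch (the model path is a placeholder and the "NGRAM" algorithm value is taken from the checks later in this file; note that constructing ServerArgs runs the full __post_init__ pipeline, so the model config must be resolvable):

    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(
        model_path="Qwen/Qwen2.5-7B-Instruct",       # placeholder model
        speculative_algorithm="NGRAM",               # value referenced by __post_init__ below
        speculative_ngram_max_match_window_size=12,  # longest suffix window to match
        speculative_ngram_max_bfs_breadth=10,        # BFS branching factor
        speculative_ngram_branch_length=18,          # draft tokens proposed per match
    )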
@@ -284,6 +340,10 @@ class ServerArgs:
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Hierarchical cache
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
@@ -294,6 +354,8 @@ class ServerArgs:
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -325,11 +387,13 @@ class ServerArgs:
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
+    enable_torch_symm_mem: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    enable_single_batch_overlap: bool = False
     tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
@@ -338,17 +402,27 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
+    enable_weights_cpu_backup: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    keep_mm_feature_on_device: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+    enable_deterministic_inference: bool = False
+
+    # Dynamic batch tokenizer
+    enable_dynamic_batch_tokenizer: bool = False
+    dynamic_batch_tokenizer_batch_size: int = 32
+    dynamic_batch_tokenizer_batch_timeout: float = 0.002
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -357,66 +431,105 @@ class ServerArgs:
     debug_tensor_dump_prefill_only: bool = False
 
     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
-    disaggregation_mode: str = "null"
+    disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
     disaggregation_decode_dp: Optional[int] = None
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    disaggregation_decode_enable_offload_kvcache: bool = False
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-    pdlb_url: Optional[str] = None
+    # FIXME: hack to reduce ITL when decode bs is small
+    disaggregation_decode_polling_interval: int = 1
 
-    # For model weight update
+    # For model weight update and weight loading
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
 
     # For PD-Multiplexing
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
-    # Deprecated arguments
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
-    enable_flashinfer_cutlass_moe: bool = False
-    enable_flashinfer_trtllm_moe: bool = False
-    enable_triton_kernel_moe: bool = False
-    enable_flashinfer_mxfp4_moe: bool = False
-
     def __post_init__(self):
-        # Check deprecated arguments
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            print_deprecated_warning(
-                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
-            )
-        if self.enable_deepep_moe:
-            self.moe_a2a_backend = "deepep"
-            print_deprecated_warning(
-                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
-            )
-        if self.enable_triton_kernel_moe:
-            self.moe_runner_backend = "triton_kernel"
-            print_deprecated_warning(
-                "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
-            )
-        if self.enable_flashinfer_cutlass_moe:
-            self.moe_runner_backend = "flashinfer_cutlass"
-            print_deprecated_warning(
-                "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
-            )
-        if self.enable_flashinfer_trtllm_moe:
-            self.moe_runner_backend = "flashinfer_trtllm"
-            print_deprecated_warning(
-                "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
-            )
-        if self.enable_flashinfer_mxfp4_moe:
-            self.moe_runner_backend = "flashinfer_mxfp4"
-            print_deprecated_warning(
-                "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
-            )
+        """
+        Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
+        """
+        # Handle deprecated arguments.
+        self._handle_deprecated_args()
+
+        # Set missing default values.
+        self._handle_missing_default_values()
+
+        # Get GPU memory capacity, which is a common dependency for several configuration steps.
+        gpu_mem = get_device_memory_capacity(self.device)
+
+        # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
+        self._handle_gpu_memory_settings(gpu_mem)
+
+        # Handle device-specific backends.
+        self._handle_hpu_backends()
+        self._handle_cpu_backends()
+
+        # Apply model-specific adjustments.
+        self._handle_model_specific_adjustments()
+
+        # Set kernel backends.
+        self._handle_sampling_backend()
+        self._handle_attention_backend_compatibility()
+        self._handle_page_size()
+        self._handle_amd_specifics()
+        self._handle_grammar_backend()
+
+        # Handle data parallelism.
+        self._handle_data_parallelism()
+
+        # Handle MoE configurations.
+        self._handle_moe_kernel_config()
+        self._handle_deepep_moe()
+        self._handle_eplb_and_dispatch()
+        self._handle_expert_distribution_metrics()
 
-        # Set missing default values
+        # Handle pipeline parallelism.
+        self._handle_pipeline_parallelism()
+
+        # Handle Hicache settings.
+        self._handle_hicache()
+
+        # Handle speculative decoding logic.
+        self._handle_speculative_decoding()
+
+        # Handle model loading format.
+        self._handle_load_format()
+
+        # Handle PD disaggregation.
+        self._handle_disaggregation()
+
+        # Validate tokenizer settings.
+        self._handle_tokenizer_batching()
+
+        # Propagate environment variables.
+        self._handle_environment_variables()
+
+        # Validate cache settings.
+        self._handle_cache_compatibility()
+
+        # Validate metrics labels.
+        self._handle_metrics_labels()
+
+        # Handle deterministic inference.
+        self._handle_deterministic_inference()
+
+        # Handle any other necessary validations.
+        self._handle_other_validations()
+
+    def _handle_deprecated_args(self):
+        pass
+
+    def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
         if self.served_model_name is None:
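Because ServerArgs is a @dataclasses.dataclass, this whole pipeline runs automatically at construction time, and the _handle_* helpers execute strictly in the order listed (gpu_mem is computed once and threaded into _handle_gpu_memory_settings). A minimal sketch of the same pattern in isolation, with illustrative names rather than sglang's:

    import dataclasses
    from typing import Optional

    @dataclasses.dataclass
    class MiniArgs:
        chunked_prefill_size: Optional[int] = None
        mem_fraction_static: Optional[float] = None

        def __post_init__(self):
            # Small, ordered handlers, as in ServerArgs.__post_init__ above.
            self._handle_missing_default_values()
            self._handle_memory()

        def _handle_missing_default_values(self):
            if self.chunked_prefill_size is None:
                self.chunked_prefill_size = 4096

        def _handle_memory(self):
            # Relies on the default filled in by the previous handler.
            if self.mem_fraction_static is None:
                self.mem_fraction_static = 0.88

    print(MiniArgs())
    # MiniArgs(chunked_prefill_size=4096, mem_fraction_static=0.88)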
@@ -426,51 +539,140 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-        gpu_mem = get_device_memory_capacity(self.device)
+    def _handle_gpu_memory_settings(self, gpu_mem):
+        """
+        Configure GPU memory-dependent settings including
+        chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
+
+        Here are our heuristics:
+        - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
+          This is because GPUs with more memory are generally more powerful, we need to use a larger
+          chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
+        - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
+
+        GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+
+        The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
+        or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
+
+        In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
+        The activation memory is proportional to the chunked_prefill_size.
+        The cuda graph memory is proportional to the cuda_graph_max_bs.
+        We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
+        and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
+
+        The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
+        """
+        if gpu_mem is not None:
+            if gpu_mem < 20 * 1024:
+                # T4, 4080
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 8
+            elif gpu_mem < 35 * 1024:
+                # A10, 4090, 5090
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
+                    # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
+                    # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 16
+                    else:
+                        self.cuda_graph_max_bs = 80
+            elif gpu_mem < 60 * 1024:
+                # A100 (40GB), L40,
+                # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 4096
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 32
+                    else:
+                        self.cuda_graph_max_bs = 160
+            elif gpu_mem < 90 * 1024:
+                # H100, A100
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            elif gpu_mem < 160 * 1024:
+                # H20, H200
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            else:
+                # B200, MI300
+                # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 16384
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 512
+        else:
+            # Fallback defaults when gpu_mem is None
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 4096
+            if self.cuda_graph_max_bs is None:
+                self.cuda_graph_max_bs = 160
 
-        # Set mem fraction static
-        if self.mem_fraction_static is None:
-            if gpu_mem is not None:
-                # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
-                # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
-
-                # We want mem_fraction_static to be as large as possible but still has enough room
-                # for activations and cuda graph buffers. We use the following heuristic to
-                # compute the needed size for activations and cuda graph buffers:
-                # - The size of the activation depends on the chunked_prefill_size and model size.
-                # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
-                # For GPUs with more memory, we use a larger chunked_prefill_size and
-                # capture more cuda graphs, so they need to reserve more memory.
-                parallel_size = self.tp_size * self.pp_size
-
-                if gpu_mem < 20 * 1024:
-                    # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
-                    reserved_mem = (2.8 + parallel_size / 10) * 1024
-                elif gpu_mem < 35 * 1024:
-                    # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
-                    reserved_mem = (2.8 + parallel_size / 10) * 1024
-                elif gpu_mem < 90 * 1024:
-                    # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
-                    reserved_mem = (9.5 + parallel_size / 2) * 1024
-                elif gpu_mem < 100 * 1024:
-                    # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
-                    reserved_mem = (12 + parallel_size / 2) * 1024
-                elif gpu_mem < 160 * 1024:
-                    # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
-                    reserved_mem = (12 + parallel_size / 2) * 1024
-                else:
-                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
-                    reserved_mem = 32 * 1024
+        # Set cuda graph batch sizes
+        if self.cuda_graph_bs is None:
+            self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
+        else:
+            self.cuda_graph_max_bs = max(self.cuda_graph_bs)
 
-                if self.speculative_algorithm is not None:
-                    # draft model and larger cuda graph buffers
+        if self.mem_fraction_static is None:
+            # Constant meta data (e.g., from attention backend)
+            reserved_mem = 512
+            # For activation during large prefill
+            if self.chunked_prefill_size > 0:
+                reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
+            else:
+                reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
+            # For cuda graphs
+            reserved_mem += self.cuda_graph_max_bs * 2
+            # Some adjustments for large parallel size
+            reserved_mem += self.tp_size * self.pp_size / 8 * 1024
+
+            if self.enable_dp_attention:
+                # DP attention needs more padding for some operations
+                reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
+
+                # DP attention uses much more memory for large cuda graph max bs,
+                # likely due to some inefficiencies in torch allocator or our implementation.
+                # So we need to reserve more memory.
+                if self.cuda_graph_max_bs > 300:
+                    reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
+
+            if gpu_mem is not None and gpu_mem > 60 * 1024:
+                reserved_mem = max(reserved_mem, 10 * 1024)
+
+            if self.speculative_algorithm is not None:
+                if self.speculative_algorithm == "STANDALONE":
+                    # standalone draft model and cuda graphs
+                    reserved_mem += 6 * 1024
+                elif self.speculative_algorithm != "NGRAM":
+                    # eagle draft models and cuda graphs
                     reserved_mem += 2 * 1024
-                    if self.enable_dp_attention:
-                        reserved_mem += 4 * 1024
 
-                self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
-            else:
-                self.mem_fraction_static = 0.88
+            self.mem_fraction_static = (
+                round((gpu_mem - reserved_mem) / gpu_mem, 3)
+                if gpu_mem is not None
+                else 0.88
+            )
 
         # Lazy init to avoid circular import
         # Multimodal models need more memory for the image processor
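To make the reserved-memory heuristic documented in _handle_gpu_memory_settings concrete, here is the arithmetic for one assumed configuration: an 80 GB-class GPU reporting gpu_mem of about 81,000 (same units as get_device_memory_capacity), tp=1, pp=1, no DP attention and no speculative decoding. The numbers are illustrative only:

    # Worked example of the heuristic above (assumed values, not measurements).
    gpu_mem = 81_000
    chunked_prefill_size = 8192   # default chosen for the 60-90 GB bracket above
    cuda_graph_max_bs = 256       # default for tp < 4 in the same bracket
    tp_size, pp_size = 1, 1

    reserved_mem = 512                                     # constant metadata
    reserved_mem += max(chunked_prefill_size, 2048) * 1.5  # activations -> 12288.0
    reserved_mem += cuda_graph_max_bs * 2                  # cuda graphs -> 512
    reserved_mem += tp_size * pp_size / 8 * 1024           # parallelism -> 128.0
    reserved_mem = max(reserved_mem, 10 * 1024)            # floor applied when gpu_mem > 60 * 1024

    mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
    print(reserved_mem, mem_fraction_static)  # 13440.0 0.834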
@@ -480,53 +682,192 @@ class ServerArgs:
         if model_config.is_multimodal:
             self.adjust_mem_fraction_for_vlm(model_config)
 
-        # Set chunked prefill size, which depends on the gpu memory capacity
-        if self.chunked_prefill_size is None:
-            if gpu_mem is not None:
-                if gpu_mem < 35 * 1024:  # A10, L40, 4090
-                    self.chunked_prefill_size = 2048
-                elif gpu_mem < 160 * 1024:  # H100, H200, A100, H20
-                    self.chunked_prefill_size = 8192
-                else:  # B200, MI300
-                    self.chunked_prefill_size = 16384
-            else:
-                self.chunked_prefill_size = 4096
+    def _generate_cuda_graph_batch_sizes(self):
+        """
+        Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
+        This integrates the logic from cuda_graph_runner.py.
+        """
+        # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
+        if self.disable_cuda_graph_padding:
+            capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
+        elif self.speculative_algorithm is None:
+            # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1))
+            capture_bs = (
+                [1, 2, 4, 8, 12]
+                + list(range(16, 257, 8))
+                + list(range(272, 512, 16))
+                + list(range(512, self.cuda_graph_max_bs + 1, 32))
+            )
+        else:
+            # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8))
+            capture_bs = (
+                list(range(1, 9, 1))
+                + list(range(10, 33, 2))
+                + list(range(40, 64, 4))
+                + list(range(72, 257, 8))
+                + list(range(272, self.cuda_graph_max_bs + 1, 16))
+            )
 
-        # Set cuda graph max batch size
-        if self.cuda_graph_max_bs is None:
-            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
-            if gpu_mem is not None and gpu_mem < 35 * 1024:
-                if self.tp_size < 4:
-                    self.cuda_graph_max_bs = 8
-                else:
-                    self.cuda_graph_max_bs = 80
+        capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]
+
+        return capture_bs
 
-        # Set kernel backends for hpu device
+    def _handle_hpu_backends(self):
         if self.device == "hpu":
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"
 
-        # Model-specific adjustments
-        self.model_specific_adjustments()
-
-        # Set kernel backends
+    def _handle_cpu_backends(self):
         if self.device == "cpu":
             if self.attention_backend is None:
                 self.attention_backend = "intel_amx"
             self.sampling_backend = "pytorch"
 
+    def _handle_model_specific_adjustments(self):
+        from sglang.srt.configs.model_config import is_deepseek_nsa
+
+        if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
+            return
+
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                if is_cuda() and is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                elif is_cuda() and is_sm90_supported():
+                    self.attention_backend = "fa3"
+                else:
+                    self.attention_backend = "triton"
+            supported_backends = ["triton", "trtllm_mha", "fa3"]
+            logger.info(
+                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
+            )
+            assert (
+                self.attention_backend in supported_backends
+            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+
+            if is_sm100_supported():
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.moe_runner_backend = "flashinfer_mxfp4"
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.moe_runner_backend == "triton_kernel":
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
+        elif "Llama4" in model_arch and self.device != "cpu":
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
+        if is_deepseek_nsa(hf_config):
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
+                self.attention_backend = "nsa"
+                logger.warning("Set nsa attention backend for DeepSeek NSA.")
+
+            if not is_npu():
+                self.enable_dp_attention = True
+                self.dp_size = self.tp_size
+                logger.warning("DP attention is enabled for DeepSeek NSA.")
+
+            self.page_size = 64
+            logger.warning("Setting page size to 64 for DeepSeek NSA.")
+
+            self.mem_fraction_static = 0.8
+            logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
+
+            # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
+            import torch
+
+            major, _ = torch.cuda.get_device_capability()
+            if major >= 10:
+                self.kv_cache_dtype = "fp8_e4m3"
+                logger.warning("Setting KV cache dtype to fp8.")
+
+            if self.kv_cache_dtype == "fp8_e4m3":
+                self.nsa_prefill = "flashmla_decode"
+                self.nsa_decode = "flashmla_decode"
+                logger.warning(
+                    "Setting NSA backend to flashmla_decode for FP8 KV Cache."
+                )
+
+            # Logging env vars for NSA
+            from sglang.srt.layers.attention.nsa.utils import (
+                print_nsa_bool_env_vars,
+            )
+
+            print_nsa_bool_env_vars()
+
+    def _handle_sampling_backend(self):
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
             )
 
+    def _handle_attention_backend_compatibility(self):
         if self.attention_backend == "torch_native":
             logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True
 
-        if self.attention_backend == "ascend":
+        if self.attention_backend == "flex_attention":
+            logger.warning(
+                "Cuda graph is disabled because of using torch Flex Attention backend"
+            )
+            self.disable_cuda_graph = True
+            assert (
+                self.speculative_algorithm is None
+            ), "Speculative decoding is currently not supported with Flex Attention backend"
+
+        if is_npu() and self.attention_backend in ["ascend"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
@@ -588,30 +929,30 @@ class ServerArgs:
588
929
 
589
930
  if self.attention_backend == "dual_chunk_flash_attn":
590
931
  logger.warning(
591
- "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
932
+ "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
592
933
  )
593
934
  self.enable_mixed_chunk = False
594
- self.disable_cuda_graph = True
595
935
  self.disable_radix_cache = True
596
936
 
597
- # Set page size
937
+ def _handle_page_size(self):
598
938
  if self.page_size is None:
599
939
  self.page_size = 1
600
940
 
601
- # AMD-specific Triton attention KV splits default number
941
+ def _handle_amd_specifics(self):
602
942
  if is_hip():
603
943
  self.triton_attention_num_kv_splits = 16
604
944
 
605
- # Choose grammar backend
945
+ def _handle_grammar_backend(self):
606
946
  if self.grammar_backend is None:
607
947
  self.grammar_backend = "xgrammar"
608
948
 
609
- # Data parallelism attention
949
+ def _handle_data_parallelism(self):
950
+ if self.dp_size == 1:
951
+ self.enable_dp_attention = False
952
+ self.enable_dp_lm_head = False
953
+
610
954
  if self.enable_dp_attention:
611
955
  self.schedule_conservativeness = self.schedule_conservativeness * 0.3
612
- assert (
613
- self.dp_size > 1
614
- ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
615
956
  assert self.tp_size % self.dp_size == 0
616
957
  self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
617
958
  logger.warning(
@@ -623,7 +964,7 @@ class ServerArgs:
623
964
  self.enable_dp_attention
624
965
  ), "Please enable dp attention when setting enable_dp_lm_head. "
625
966
 
626
- # MoE kernel
967
+ def _handle_moe_kernel_config(self):
627
968
  if self.moe_runner_backend == "flashinfer_cutlass":
628
969
  assert (
629
970
  self.quantization == "modelopt_fp4"
@@ -634,13 +975,15 @@ class ServerArgs:
634
975
  ], "The expert parallel size must be 1 or the same as the tensor parallel size"
635
976
 
636
977
  if self.moe_runner_backend == "flashinfer_trtllm":
637
- if not self.disable_shared_experts_fusion:
638
- self.disable_shared_experts_fusion = True
639
- logger.warning(
640
- "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
641
- )
978
+ assert (
979
+ self.quantization == "modelopt_fp4" or self.quantization == "fp8"
980
+ ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
981
+ self.disable_shared_experts_fusion = True
982
+ logger.warning(
983
+ "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
984
+ )
642
985
 
643
- # DeepEP MoE
986
+ def _handle_deepep_moe(self):
644
987
  if self.moe_a2a_backend == "deepep":
645
988
  if self.deepep_mode == "normal":
646
989
  logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -650,6 +993,7 @@ class ServerArgs:
650
993
  f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
651
994
  )
652
995
 
996
+ def _handle_eplb_and_dispatch(self):
653
997
  if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
654
998
  self.expert_distribution_recorder_mode = "stat"
655
999
  logger.warning(
@@ -664,6 +1008,7 @@ class ServerArgs:
664
1008
  if self.enable_eplb:
665
1009
  assert self.ep_size > 1
666
1010
 
1011
+ def _handle_expert_distribution_metrics(self):
667
1012
  if self.enable_expert_distribution_metrics and (
668
1013
  self.expert_distribution_recorder_mode is None
669
1014
  ):
@@ -675,25 +1020,42 @@ class ServerArgs:
675
1020
  elif self.expert_distribution_recorder_mode is not None:
676
1021
  self.expert_distribution_recorder_buffer_size = 1000
677
1022
 
678
- # Pipeline parallelism
1023
+ def _handle_pipeline_parallelism(self):
679
1024
  if self.pp_size > 1:
680
1025
  self.disable_overlap_schedule = True
681
1026
  logger.warning(
682
1027
  "Pipeline parallelism is incompatible with overlap schedule."
683
1028
  )
684
1029
 
685
- # Hicache
1030
+ def _handle_hicache(self):
686
1031
  if self.hicache_storage_backend == "mooncake":
687
- # to use mooncake storage backend, the following conditions must be met:
688
- self.hicache_io_backend = "kernel"
689
- self.hicache_mem_layout = "page_first"
1032
+ if self.hicache_mem_layout == "layer_first":
1033
+ if self.hicache_io_backend == "direct":
1034
+ self.hicache_mem_layout = "page_first_direct"
1035
+ elif self.hicache_io_backend == "kernel":
1036
+ self.hicache_mem_layout = "page_first"
1037
+ logger.warning(
1038
+ f"Mooncake storage backend does not support layer_first layout, "
1039
+ f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
1040
+ )
690
1041
 
691
- # Speculative Decoding
1042
+ if self.hicache_mem_layout == "page_first_direct":
1043
+ if self.hicache_io_backend != "direct":
1044
+ self.hicache_io_backend = "direct"
1045
+ logger.warning(
1046
+ "Page first direct layout only support direct io backend"
1047
+ )
1048
+
1049
+ def _handle_speculative_decoding(self):
692
1050
  if self.speculative_algorithm == "NEXTN":
693
- # NEXTN shares the same implementation of EAGLE
694
1051
  self.speculative_algorithm = "EAGLE"
695
1052
 
696
- if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
1053
+ if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
1054
+ if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
1055
+ # TODO: support dp attention for standalone speculative decoding
1056
+ raise ValueError(
1057
+ "Currently standalone speculative decoding does not support dp attention."
1058
+ )
697
1059
  if self.max_running_requests is None:
698
1060
  self.max_running_requests = 48
699
1061
  self.disable_overlap_schedule = True
@@ -709,8 +1071,13 @@ class ServerArgs:
709
1071
  )
710
1072
 
711
1073
  model_arch = self.get_hf_config().architectures[0]
712
- if model_arch in ["DeepseekV3ForCausalLM", "Glm4MoeForCausalLM"]:
713
- # Auto set draft_model_path DeepSeek-V3/R1
1074
+ if model_arch in [
1075
+ "DeepseekV32ForCausalLM",
1076
+ "DeepseekV3ForCausalLM",
1077
+ "Glm4MoeForCausalLM",
1078
+ "BailingMoeForCausalLM",
1079
+ "BailingMoeV2ForCausalLM",
1080
+ ]:
714
1081
  if self.speculative_draft_model_path is None:
715
1082
  self.speculative_draft_model_path = self.model_path
716
1083
  else:
@@ -718,7 +1085,6 @@ class ServerArgs:
718
1085
  "DeepSeek MTP does not require setting speculative_draft_model_path."
719
1086
  )
720
1087
 
721
- # Auto choose parameters
722
1088
  if self.speculative_num_steps is None:
723
1089
  assert (
724
1090
  self.speculative_eagle_topk is None
@@ -758,23 +1124,63 @@ class ServerArgs:
758
1124
  "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
759
1125
  )
760
1126
 
761
- # The token generated from the verify step is counted.
762
- # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
763
- # assert self.speculative_num_steps < self.speculative_num_draft_tokens
1127
+ if self.speculative_algorithm == "NGRAM":
1128
+ if not self.device.startswith("cuda"):
1129
+ raise ValueError(
1130
+ "Ngram speculative decoding only supports CUDA device."
1131
+ )
1132
+ if self.max_running_requests is None:
1133
+ self.max_running_requests = 48
1134
+ self.disable_overlap_schedule = True
1135
+ self.enable_mixed_chunk = False
1136
+ self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
1137
+ if self.speculative_num_draft_tokens is None:
1138
+ self.speculative_num_draft_tokens = (
1139
+ self.speculative_ngram_max_match_window_size
1140
+ )
1141
+ logger.warning(
1142
+ "The overlap scheduler and mixed chunked prefill are disabled because of "
1143
+ "using ngram speculative decoding."
1144
+ )
1145
+
1146
+ if (
1147
+ self.speculative_eagle_topk > 1
1148
+ and self.page_size > 1
1149
+ and self.attention_backend != "flashinfer"
1150
+ ):
1151
+ raise ValueError(
1152
+ f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
1153
+ f"with page_size({self.page_size}) > 1 is unstable "
1154
+ "and produces incorrect results for paged attention backends. "
1155
+ "This combination is only supported for the 'flashinfer' backend."
1156
+ )
1157
+ if self.enable_dp_attention:
1158
+ # TODO: support dp attention for ngram speculative decoding
1159
+ raise ValueError(
1160
+ "Currently ngram speculative decoding does not support dp attention."
1161
+ )
764
1162
 
765
- # GGUF
1163
+ def _handle_load_format(self):
766
1164
  if (
767
1165
  self.load_format == "auto" or self.load_format == "gguf"
768
1166
  ) and check_gguf_file(self.model_path):
769
1167
  self.quantization = self.load_format = "gguf"
770
1168
 
771
- # Model loading
772
1169
  if is_remote_url(self.model_path):
773
1170
  self.load_format = "remote"
1171
+
774
1172
  if self.custom_weight_loader is None:
775
1173
  self.custom_weight_loader = []
776
1174
 
777
- # PD disaggregation
1175
+ if self.load_format == "remote_instance":
1176
+ if (
1177
+ self.remote_instance_weight_loader_seed_instance_ip is None
1178
+ or self.remote_instance_weight_loader_seed_instance_service_port is None
1179
+ or self.remote_instance_weight_loader_send_weights_group_ports is None
1180
+ ):
1181
+ self.load_format = "auto"
1182
+
1183
+ def _handle_disaggregation(self):
778
1184
  if self.disaggregation_mode == "decode":
779
1185
  assert (
780
1186
  self.disaggregation_decode_tp is None
@@ -785,6 +1191,13 @@ class ServerArgs:
785
1191
 
786
1192
  self.disable_radix_cache = True
787
1193
  logger.warning("KV cache is forced as chunk cache for decode server")
1194
+
1195
+ if self.dp_size > 1 and not is_in_ci():
1196
+ assert self.prefill_round_robin_balance, (
1197
+ "Prefill round robin balance is required when dp size > 1. "
1198
+ "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
1199
+ " and `--prefill-round-robin-balance` is set for decode server."
1200
+ )
788
1201
  elif self.disaggregation_mode == "prefill":
789
1202
  if self.disaggregation_decode_tp is None:
790
1203
  self.disaggregation_decode_tp = self.tp_size
@@ -793,25 +1206,84 @@ class ServerArgs:
793
1206
 
794
1207
  self.disaggregation_prefill_pp = self.pp_size
795
1208
  self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
796
-
797
1209
  self.disable_cuda_graph = True
798
1210
  logger.warning("Cuda graph is disabled for prefill server")
799
1211
 
800
- # Propagate env vars
1212
+ def _handle_tokenizer_batching(self):
1213
+ if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
1214
+ raise ValueError(
1215
+ "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
1216
+ "Please choose one tokenizer batching approach."
1217
+ )
1218
+
1219
+ def _handle_environment_variables(self):
801
1220
  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
802
1221
  "1" if self.enable_torch_compile else "0"
803
1222
  )
804
- # Set env var before grammar backends init
1223
+ os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
805
1224
  os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
806
1225
  "1" if self.disable_outlines_disk_cache else "0"
807
1226
  )
1227
+ os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
1228
+ "1" if self.enable_deterministic_inference else "0"
1229
+ )
808
1230
 
1231
+ def _handle_cache_compatibility(self):
809
1232
  if self.enable_hierarchical_cache and self.disable_radix_cache:
810
1233
  raise ValueError(
811
1234
  "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
812
1235
  "and cannot be used at the same time. Please use only one of them."
813
1236
  )
814
1237
 
1238
+ if (
1239
+ self.disaggregation_decode_enable_offload_kvcache
1240
+ and self.disaggregation_mode != "decode"
1241
+ ):
1242
+ raise ValueError(
1243
+ "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
1244
+ )
1245
+
1246
+ def _handle_metrics_labels(self):
1247
+ if (
1248
+ not self.tokenizer_metrics_custom_labels_header
1249
+ and self.tokenizer_metrics_allowed_custom_labels
1250
+ ):
1251
+ raise ValueError(
1252
+ "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
1253
+ )
1254
+
1255
+ def _handle_deterministic_inference(self):
1256
+ if self.enable_deterministic_inference:
1257
+ # Check sampling backend
1258
+ self.sampling_backend = "pytorch"
1259
+ logger.warning(
1260
+ "Sampling backend is set to pytorch for deterministic inference."
1261
+ )
1262
+
1263
+ # Check attention backend
1264
+ if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
1265
+ raise ValueError(
1266
+ f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
1267
+ )
1268
+
1269
+ # Currently, only FA3 supports radix cache. Support for other backends is in progress
1270
+ if self.attention_backend != "fa3":
1271
+ self.disable_radix_cache = True
1272
+ logger.warning(
1273
+ f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
1274
+ )
1275
+
1276
+ # Check TP size
1277
+ if self.tp_size > 1:
1278
+ os.environ["NCCL_ALGO"] = "allreduce:tree"
1279
+ self.disable_custom_all_reduce = True
1280
+ logger.warning(
1281
+ "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
1282
+ )
1283
+
1284
+ def _handle_other_validations(self):
1285
+ pass
1286
+
815
1287
  @staticmethod
816
1288
  def add_cli_args(parser: argparse.ArgumentParser):
817
1289
  # Model and tokenizer
@@ -828,12 +1300,6 @@ class ServerArgs:
828
1300
  default=ServerArgs.tokenizer_path,
829
1301
  help="The path of the tokenizer.",
830
1302
  )
831
- parser.add_argument(
832
- "--tokenizer-worker-num",
833
- type=int,
834
- default=ServerArgs.tokenizer_worker_num,
835
- help="The worker num of the tokenizer manager.",
836
- )
837
1303
  parser.add_argument(
838
1304
  "--tokenizer-mode",
839
1305
  type=str,
@@ -843,6 +1309,12 @@ class ServerArgs:
843
1309
  "tokenizer if available, and 'slow' will "
844
1310
  "always use the slow tokenizer.",
845
1311
  )
1312
+ parser.add_argument(
1313
+ "--tokenizer-worker-num",
1314
+ type=int,
1315
+ default=ServerArgs.tokenizer_worker_num,
1316
+ help="The worker num of the tokenizer manager.",
1317
+ )
846
1318
  parser.add_argument(
847
1319
  "--skip-tokenizer-init",
848
1320
  action="store_true",
@@ -990,6 +1462,11 @@ class ServerArgs:
990
1462
  choices=["auto", "fp8_e5m2", "fp8_e4m3"],
991
1463
  help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
992
1464
  )
1465
+ parser.add_argument(
1466
+ "--enable-fp32-lm-head",
1467
+ action="store_true",
1468
+ help="If set, the LM head outputs (logits) are in FP32.",
1469
+ )
993
1470
 
994
1471
  # Memory and scheduling
995
1472
  parser.add_argument(
@@ -1033,9 +1510,27 @@ class ServerArgs:
1033
1510
  "--schedule-policy",
1034
1511
  type=str,
1035
1512
  default=ServerArgs.schedule_policy,
1036
- choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
1513
+ choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
1037
1514
  help="The scheduling policy of the requests.",
1038
1515
  )
1516
+ parser.add_argument(
1517
+ "--enable-priority-scheduling",
1518
+ action="store_true",
1519
+ default=ServerArgs.enable_priority_scheduling,
1520
+ help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
1521
+ )
1522
+ parser.add_argument(
1523
+ "--schedule-low-priority-values-first",
1524
+ action="store_true",
1525
+ default=ServerArgs.schedule_low_priority_values_first,
1526
+ help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
1527
+ )
1528
+ parser.add_argument(
1529
+ "--priority-scheduling-preemption-threshold",
1530
+ type=int,
1531
+ default=ServerArgs.priority_scheduling_preemption_threshold,
1532
+ help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
1533
+ )
1039
1534
  parser.add_argument(
1040
1535
  "--schedule-conservativeness",
1041
1536
  type=float,
@@ -1207,6 +1702,21 @@ class ServerArgs:
1207
1702
  "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
1208
1703
  "otherwise all metrics appear to come from TP 0.",
1209
1704
  )
1705
+ parser.add_argument(
1706
+ "--tokenizer-metrics-custom-labels-header",
1707
+ type=str,
1708
+ default=ServerArgs.tokenizer_metrics_custom_labels_header,
1709
+ help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
1710
+ )
1711
+ parser.add_argument(
1712
+ "--tokenizer-metrics-allowed-custom-labels",
1713
+ type=str,
1714
+ nargs="+",
1715
+ default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
1716
+ help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
1717
+ "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
1718
+ "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
1719
+ )
1210
1720
  parser.add_argument(
1211
1721
  "--bucket-time-to-first-token",
1212
1722
  type=float,
@@ -1234,6 +1744,26 @@ class ServerArgs:
1234
1744
  default=ServerArgs.collect_tokens_histogram,
1235
1745
  help="Collect prompt/generation tokens histogram.",
1236
1746
  )
1747
+ bucket_rule = (
1748
+ "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
1749
+ "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
1750
+ "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'custom <value1> "
1751
+ "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
1752
+ )
1753
+ parser.add_argument(
1754
+ "--prompt-tokens-buckets",
1755
+ type=str,
1756
+ nargs="+",
1757
+ default=ServerArgs.prompt_tokens_buckets,
1758
+ help=f"The buckets rule of prompt tokens. {bucket_rule}",
1759
+ )
1760
+ parser.add_argument(
1761
+ "--generation-tokens-buckets",
1762
+ type=str,
1763
+ nargs="+",
1764
+ default=ServerArgs.generation_tokens_buckets,
1765
+ help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
1766
+ )
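The `tse` bucket rule in the help text above is a two-sided exponential spread around a midpoint. Below is a minimal sketch that reproduces the worked example from the help string, assuming the straightforward `middle ± base^i` construction; the actual sglang implementation may differ in details.

```python
# Illustrative reconstruction of the 'tse <middle> <base> <count>' bucket rule,
# based only on the worked example in the help text ('tse 1000 2 8').
def tse_buckets(middle: float, base: float, count: int) -> list[float]:
    half = count // 2  # half of the buckets on each side of `middle`
    offsets = [base**i for i in range(1, half + 1)]  # 2, 4, 8, 16 for base=2
    return sorted(
        [middle - o for o in offsets] + [middle] + [middle + o for o in offsets]
    )

print(tse_buckets(1000.0, 2, 8))
# [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]
```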
1237
1767
  parser.add_argument(
1238
1768
  "--gc-warning-threshold-secs",
1239
1769
  type=float,
@@ -1258,6 +1788,17 @@ class ServerArgs:
1258
1788
  default=None,
1259
1789
  help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
1260
1790
  )
1791
+ parser.add_argument(
1792
+ "--enable-trace",
1793
+ action="store_true",
1794
+ help="Enable opentelemetry trace",
1795
+ )
1796
+ parser.add_argument(
1797
+ "--oltp-traces-endpoint",
1798
+ type=str,
1799
+ default="localhost:4317",
1800
+ help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
1801
+ )
1261
1802
 
1262
1803
  # API related
1263
1804
  parser.add_argument(
@@ -1342,6 +1883,18 @@ class ServerArgs:
1342
1883
  "minimum_tokens",
1343
1884
  ],
1344
1885
  )
1886
+ parser.add_argument(
1887
+ "--load-watch-interval",
1888
+ type=float,
1889
+ default=ServerArgs.load_watch_interval,
1890
+ help="The interval of load watching in seconds.",
1891
+ )
1892
+ parser.add_argument(
1893
+ "--prefill-round-robin-balance",
1894
+ default=ServerArgs.prefill_round_robin_balance,
1895
+ action="store_true",
1896
+ help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
1897
+ )
1345
1898
 
1346
1899
  # Multi-node distributed serving
1347
1900
  parser.add_argument(
@@ -1416,9 +1969,17 @@ class ServerArgs:
1416
1969
  parser.add_argument(
1417
1970
  "--lora-backend",
1418
1971
  type=str,
1419
- default="triton",
1972
+ choices=LORA_BACKEND_CHOICES,
1973
+ default=ServerArgs.lora_backend,
1420
1974
  help="Choose the kernel backend for multi-LoRA serving.",
1421
1975
  )
1976
+ parser.add_argument(
1977
+ "--max-lora-chunk-size",
1978
+ type=int,
1979
+ default=ServerArgs.max_lora_chunk_size,
1980
+ choices=[16, 32, 64, 128],
1981
+ help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
1982
+ )
1422
1983
 
1423
1984
  # Kernel backend
1424
1985
  parser.add_argument(
@@ -1452,30 +2013,51 @@ class ServerArgs:
1452
2013
  parser.add_argument(
1453
2014
  "--grammar-backend",
1454
2015
  type=str,
1455
- choices=["xgrammar", "outlines", "llguidance", "none"],
2016
+ choices=GRAMMAR_BACKEND_CHOICES,
1456
2017
  default=ServerArgs.grammar_backend,
1457
2018
  help="Choose the backend for grammar-guided decoding.",
1458
2019
  )
1459
2020
  parser.add_argument(
1460
2021
  "--mm-attention-backend",
1461
2022
  type=str,
1462
- choices=["sdpa", "fa3", "triton_attn"],
2023
+ choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
1463
2024
  default=ServerArgs.mm_attention_backend,
1464
2025
  help="Set multimodal attention backend.",
1465
2026
  )
2027
+ parser.add_argument(
2028
+ "--nsa-prefill",
2029
+ default=ServerArgs.nsa_prefill,
2030
+ type=str,
2031
+ choices=NSA_CHOICES,
2032
+ )
2033
+ parser.add_argument(
2034
+ "--nsa-decode",
2035
+ default=ServerArgs.nsa_decode,
2036
+ type=str,
2037
+ choices=NSA_CHOICES,
2038
+ )
1466
2039
 
1467
2040
  # Speculative decoding
1468
2041
  parser.add_argument(
1469
2042
  "--speculative-algorithm",
1470
2043
  type=str,
1471
- choices=["EAGLE", "EAGLE3", "NEXTN"],
2044
+ choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
1472
2045
  help="Speculative algorithm.",
1473
2046
  )
1474
2047
  parser.add_argument(
1475
2048
  "--speculative-draft-model-path",
2049
+ "--speculative-draft-model",
1476
2050
  type=str,
1477
2051
  help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
1478
2052
  )
2053
+ parser.add_argument(
2054
+ "--speculative-draft-model-revision",
2055
+ type=str,
2056
+ default=None,
2057
+ help="The specific draft model version to use. It can be a branch "
2058
+ "name, a tag name, or a commit id. If unspecified, will use "
2059
+ "the default version.",
2060
+ )
1479
2061
  parser.add_argument(
1480
2062
  "--speculative-num-steps",
1481
2063
  type=int,
@@ -1512,6 +2094,57 @@ class ServerArgs:
1512
2094
  help="The path of the draft model's small vocab table.",
1513
2095
  default=ServerArgs.speculative_token_map,
1514
2096
  )
2097
+ parser.add_argument(
2098
+ "--speculative-attention-mode",
2099
+ type=str,
2100
+ choices=["prefill", "decode"],
2101
+ help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
2102
+ default=ServerArgs.speculative_attention_mode,
2103
+ )
2104
+ # Ngram speculative decoding
2105
+ parser.add_argument(
2106
+ "--speculative-ngram-min-match-window-size",
2107
+ type=int,
2108
+ default=ServerArgs.speculative_ngram_min_match_window_size,
2109
+ help="The minimum window size for pattern matching in ngram speculative decoding.",
2110
+ )
2111
+ parser.add_argument(
2112
+ "--speculative-ngram-max-match-window-size",
2113
+ type=int,
2114
+ default=ServerArgs.speculative_ngram_max_match_window_size,
2115
+ help="The maximum window size for pattern matching in ngram speculative decoding.",
2116
+ )
2117
+ parser.add_argument(
2118
+ "--speculative-ngram-min-bfs-breadth",
2119
+ type=int,
2120
+ default=ServerArgs.speculative_ngram_min_bfs_breadth,
2121
+ help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2122
+ )
2123
+ parser.add_argument(
2124
+ "--speculative-ngram-max-bfs-breadth",
2125
+ type=int,
2126
+ default=ServerArgs.speculative_ngram_max_bfs_breadth,
2127
+ help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2128
+ )
2129
+ parser.add_argument(
2130
+ "--speculative-ngram-match-type",
2131
+ type=str,
2132
+ choices=["BFS", "PROB"],
2133
+ default=ServerArgs.speculative_ngram_match_type,
2134
+ help="The match type for cache tree.",
2135
+ )
2136
+ parser.add_argument(
2137
+ "--speculative-ngram-branch-length",
2138
+ type=int,
2139
+ default=ServerArgs.speculative_ngram_branch_length,
2140
+ help="The branch length for ngram speculative decoding.",
2141
+ )
2142
+ parser.add_argument(
2143
+ "--speculative-ngram-capacity",
2144
+ type=int,
2145
+ default=ServerArgs.speculative_ngram_capacity,
2146
+ help="The cache capacity for ngram speculative decoding.",
2147
+ )
1515
2148
 
1516
2149
  # Expert parallelism
1517
2150
  parser.add_argument(
@@ -1539,6 +2172,7 @@ class ServerArgs:
1539
2172
  "flashinfer_trtllm",
1540
2173
  "flashinfer_cutlass",
1541
2174
  "flashinfer_mxfp4",
2175
+ "flashinfer_cutedsl",
1542
2176
  ],
1543
2177
  default=ServerArgs.moe_runner_backend,
1544
2178
  help="Choose the runner backend for MoE.",
@@ -1546,7 +2180,7 @@ class ServerArgs:
1546
2180
  parser.add_argument(
1547
2181
  "--flashinfer-mxfp4-moe-precision",
1548
2182
  type=str,
1549
- choices=["mxfp4", "bf16"],
2183
+ choices=["default", "bf16"],
1550
2184
  default=ServerArgs.flashinfer_mxfp4_moe_precision,
1551
2185
  help="Choose the computation precision of flashinfer mxfp4 moe",
1552
2186
  )
@@ -1639,6 +2273,21 @@ class ServerArgs:
1639
2273
  help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
1640
2274
  )
1641
2275
 
2276
+ # Mamba Cache
2277
+ parser.add_argument(
2278
+ "--max-mamba-cache-size",
2279
+ type=int,
2280
+ default=ServerArgs.max_mamba_cache_size,
2281
+ help="The maximum size of the mamba cache.",
2282
+ )
2283
+ parser.add_argument(
2284
+ "--mamba-ssm-dtype",
2285
+ type=str,
2286
+ default=ServerArgs.mamba_ssm_dtype,
2287
+ choices=["float32", "bfloat16"],
2288
+ help="The data type of the SSM states in mamba cache.",
2289
+ )
2290
+
1642
2291
  # Hierarchical cache
1643
2292
  parser.add_argument(
1644
2293
  "--enable-hierarchical-cache",
@@ -1664,6 +2313,13 @@ class ServerArgs:
1664
2313
  default=ServerArgs.hicache_write_policy,
1665
2314
  help="The write policy of hierarchical cache.",
1666
2315
  )
2316
+ parser.add_argument(
2317
+ "--radix-eviction-policy",
2318
+ type=str,
2319
+ choices=RADIX_EVICTION_POLICY_CHOICES,
2320
+ default=ServerArgs.radix_eviction_policy,
2321
+ help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
2322
+ )
1667
2323
  parser.add_argument(
1668
2324
  "--hicache-io-backend",
1669
2325
  type=str,
@@ -1674,16 +2330,19 @@ class ServerArgs:
1674
2330
  parser.add_argument(
1675
2331
  "--hicache-mem-layout",
1676
2332
  type=str,
1677
- choices=["layer_first", "page_first"],
2333
+ choices=["layer_first", "page_first", "page_first_direct"],
1678
2334
  default=ServerArgs.hicache_mem_layout,
1679
2335
  help="The layout of host memory pool for hierarchical cache.",
1680
2336
  )
1681
2337
  parser.add_argument(
1682
2338
  "--hicache-storage-backend",
1683
2339
  type=str,
1684
- choices=["file", "mooncake", "hf3fs", "nixl"],
2340
+ choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
1685
2341
  default=ServerArgs.hicache_storage_backend,
1686
- help="The storage backend for hierarchical KV cache.",
2342
+ help="The storage backend for hierarchical KV cache. "
2343
+ "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
2344
+ "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
2345
+ "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
1687
2346
  )
1688
2347
  parser.add_argument(
1689
2348
  "--hicache-storage-prefetch-policy",
@@ -1698,6 +2357,12 @@ class ServerArgs:
1698
2357
  default=ServerArgs.hicache_storage_backend_extra_config,
1699
2358
  help="A dictionary in JSON string format containing extra configuration for the storage backend.",
1700
2359
  )
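For the new `dynamic` choice, the help above lists the keys expected in `--hicache-storage-backend-extra-config`. The sketch below only shows how such a JSON payload could be assembled; the module path and class name are placeholders, not real sglang components.

```python
import json

# Hypothetical example: only the three key names come from the
# --hicache-storage-backend help text; the values are placeholders.
extra_config = json.dumps(
    {
        "backend_name": "my_kv_store",        # custom backend name (placeholder)
        "module_path": "my_pkg.kv_backend",   # Python module path (placeholder)
        "class_name": "MyKVStorageBackend",   # backend class name (placeholder)
    }
)
# Passed on the command line roughly as:
#   --hicache-storage-backend dynamic \
#   --hicache-storage-backend-extra-config '<the JSON above>'
print(extra_config)
```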
2360
+ # LMCache
2361
+ parser.add_argument(
2362
+ "--enable-lmcache",
2363
+ action="store_true",
2364
+ help="Using LMCache as an alternative hierarchical cache solution",
2365
+ )
1701
2366
 
1702
2367
  # Double Sparsity
1703
2368
  parser.add_argument(
@@ -1841,6 +2506,11 @@ class ServerArgs:
1841
2506
  action="store_true",
1842
2507
  help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
1843
2508
  )
2509
+ parser.add_argument(
2510
+ "--enable-torch-symm-mem",
2511
+ action="store_true",
2512
+ help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA device SM90 and above. SM90 supports world size 4, 6, 8. SM10 supports world size 6, 8.",
2513
+ )
1844
2514
  parser.add_argument(
1845
2515
  "--disable-overlap-schedule",
1846
2516
  action="store_true",
@@ -1866,6 +2536,11 @@ class ServerArgs:
1866
2536
  action="store_true",
1867
2537
  help="Enabling two micro batches to overlap.",
1868
2538
  )
2539
+ parser.add_argument(
2540
+ "--enable-single-batch-overlap",
2541
+ action="store_true",
2542
+ help="Let computation and communication overlap within one micro batch.",
2543
+ )
1869
2544
  parser.add_argument(
1870
2545
  "--tbo-token-distribution-threshold",
1871
2546
  type=float,
@@ -1911,6 +2586,12 @@ class ServerArgs:
1911
2586
  default=ServerArgs.triton_attention_num_kv_splits,
1912
2587
  help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
1913
2588
  )
2589
+ parser.add_argument(
2590
+ "--triton-attention-split-tile-size",
2591
+ type=int,
2592
+ default=ServerArgs.triton_attention_split_tile_size,
2593
+ help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
2594
+ )
1914
2595
  parser.add_argument(
1915
2596
  "--num-continuous-decode-steps",
1916
2597
  type=int,
@@ -1929,6 +2610,11 @@ class ServerArgs:
1929
2610
  action="store_true",
1930
2611
  help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
1931
2612
  )
2613
+ parser.add_argument(
2614
+ "--enable-weights-cpu-backup",
2615
+ action="store_true",
2616
+ help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
2617
+ )
1932
2618
  parser.add_argument(
1933
2619
  "--allow-auto-truncate",
1934
2620
  action="store_true",
@@ -1959,6 +2645,11 @@ class ServerArgs:
1959
2645
  action="store_true",
1960
2646
  help="Adopt base image processor instead of fast image processor.",
1961
2647
  )
2648
+ parser.add_argument(
2649
+ "--keep-mm-feature-on-device",
2650
+ action="store_true",
2651
+ help="Keep multimodal feature tensors on device after processing to save D2H copy.",
2652
+ )
1962
2653
  parser.add_argument(
1963
2654
  "--enable-return-hidden-states",
1964
2655
  action="store_true",
@@ -1970,6 +2661,12 @@ class ServerArgs:
1970
2661
  default=ServerArgs.scheduler_recv_interval,
1971
2662
  help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
1972
2663
  )
2664
+ parser.add_argument(
2665
+ "--numa-node",
2666
+ type=int,
2667
+ nargs="+",
2668
+ help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
2669
+ )
1973
2670
 
1974
2671
  # Debug tensor dumps
1975
2672
  parser.add_argument(
@@ -1995,12 +2692,29 @@ class ServerArgs:
1995
2692
  action="store_true",
1996
2693
  help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
1997
2694
  )
2695
+ parser.add_argument(
2696
+ "--enable-dynamic-batch-tokenizer",
2697
+ action="store_true",
2698
+ help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
2699
+ )
2700
+ parser.add_argument(
2701
+ "--dynamic-batch-tokenizer-batch-size",
2702
+ type=int,
2703
+ default=ServerArgs.dynamic_batch_tokenizer_batch_size,
2704
+ help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
2705
+ )
2706
+ parser.add_argument(
2707
+ "--dynamic-batch-tokenizer-batch-timeout",
2708
+ type=float,
2709
+ default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
2710
+ help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
2711
+ )
1998
2712
 
1999
2713
  # PD disaggregation
2000
2714
  parser.add_argument(
2001
2715
  "--disaggregation-mode",
2002
2716
  type=str,
2003
- default="null",
2717
+ default=ServerArgs.disaggregation_mode,
2004
2718
  choices=["null", "prefill", "decode"],
2005
2719
  help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
2006
2720
  )
@@ -2043,6 +2757,11 @@ class ServerArgs:
2043
2757
  "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
2044
2758
  "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
2045
2759
  )
2760
+ parser.add_argument(
2761
+ "--disaggregation-decode-enable-offload-kvcache",
2762
+ action="store_true",
2763
+ help="Enable async KV cache offloading on decode server (PD mode).",
2764
+ )
2046
2765
  parser.add_argument(
2047
2766
  "--num-reserved-decode-tokens",
2048
2767
  type=int,
@@ -2050,10 +2769,10 @@ class ServerArgs:
2050
2769
  help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
2051
2770
  )
2052
2771
  parser.add_argument(
2053
- "--pdlb-url",
2054
- type=str,
2055
- default=None,
2056
- help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
2772
+ "--disaggregation-decode-polling-interval",
2773
+ type=int,
2774
+ default=ServerArgs.disaggregation_decode_polling_interval,
2775
+ help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
2057
2776
  )
2058
2777
 
2059
2778
  # Custom weight loader
@@ -2069,6 +2788,24 @@ class ServerArgs:
2069
2788
  action="store_true",
2070
2789
  help="Disable mmap while loading weight using safetensors.",
2071
2790
  )
2791
+ parser.add_argument(
2792
+ "--remote-instance-weight-loader-seed-instance-ip",
2793
+ type=str,
2794
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
2795
+ help="The ip of the seed instance for loading weights from remote instance.",
2796
+ )
2797
+ parser.add_argument(
2798
+ "--remote-instance-weight-loader-seed-instance-service-port",
2799
+ type=int,
2800
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
2801
+ help="The service port of the seed instance for loading weights from remote instance.",
2802
+ )
2803
+ parser.add_argument(
2804
+ "--remote-instance-weight-loader-send-weights-group-ports",
2805
+ type=json_list_type,
2806
+ default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
2807
+ help="The communication group ports for loading weights from remote instance.",
2808
+ )
2072
2809
 
2073
2810
  # For PD-Multiplexing
2074
2811
  parser.add_argument(
@@ -2084,36 +2821,55 @@ class ServerArgs:
2084
2821
  help="Number of sm partition groups.",
2085
2822
  )
2086
2823
 
2824
+ # For deterministic inference
2825
+ parser.add_argument(
2826
+ "--enable-deterministic-inference",
2827
+ action="store_true",
2828
+ help="Enable deterministic inference mode with batch invariant ops.",
2829
+ )
2830
+
2087
2831
  # Deprecated arguments
2088
2832
  parser.add_argument(
2089
2833
  "--enable-ep-moe",
2090
- action="store_true",
2091
- help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
2834
+ action=DeprecatedAction,
2835
+ help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
2092
2836
  )
2093
2837
  parser.add_argument(
2094
2838
  "--enable-deepep-moe",
2095
- action="store_true",
2096
- help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
2839
+ action=DeprecatedAction,
2840
+ help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
2097
2841
  )
2098
2842
  parser.add_argument(
2099
2843
  "--enable-flashinfer-cutlass-moe",
2100
- action="store_true",
2101
- help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
2844
+ action=DeprecatedAction,
2845
+ help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
2846
+ )
2847
+ parser.add_argument(
2848
+ "--enable-flashinfer-cutedsl-moe",
2849
+ action=DeprecatedAction,
2850
+ help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
2102
2851
  )
2103
2852
  parser.add_argument(
2104
2853
  "--enable-flashinfer-trtllm-moe",
2105
- action="store_true",
2106
- help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
2854
+ action=DeprecatedAction,
2855
+ help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
2107
2856
  )
2108
2857
  parser.add_argument(
2109
2858
  "--enable-triton-kernel-moe",
2110
- action="store_true",
2111
- help="(Deprecated) Use triton moe grouped gemm kernel.",
2859
+ action=DeprecatedAction,
2860
+ help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
2112
2861
  )
2113
2862
  parser.add_argument(
2114
2863
  "--enable-flashinfer-mxfp4-moe",
2115
- action="store_true",
2116
- help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
2864
+ action=DeprecatedAction,
2865
+ help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
2866
+ )
2867
+
2868
+ # Configuration file support
2869
+ parser.add_argument(
2870
+ "--config",
2871
+ type=str,
2872
+ help="Read CLI options from a config file. Must be a YAML file with configuration options.",
2117
2873
  )
2118
2874
 
2119
2875
  @classmethod
@@ -2122,6 +2878,7 @@ class ServerArgs:
2122
2878
  args.pp_size = args.pipeline_parallel_size
2123
2879
  args.dp_size = args.data_parallel_size
2124
2880
  args.ep_size = args.expert_parallel_size
2881
+
2125
2882
  attrs = [attr.name for attr in dataclasses.fields(cls)]
2126
2883
  return cls(**{attr: getattr(args, attr) for attr in attrs})
2127
2884
 
@@ -2178,13 +2935,27 @@ class ServerArgs:
2178
2935
 
2179
2936
  # Check chunked prefill
2180
2937
  # Skip validation if chunked prefill is disabled (i.e., size <= 0).
2181
- if self.chunked_prefill_size > 0:
2938
+ # Skip validation if disaggregation mode is decode.
2939
+ if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
2182
2940
  assert (
2183
2941
  self.chunked_prefill_size % self.page_size == 0
2184
2942
  ), "chunked_prefill_size must be divisible by page_size"
2185
2943
 
2186
2944
  # Check multi tokenizer
2187
2945
  assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
2946
+ self.validate_buckets_rule(
2947
+ "--prompt-tokens-buckets", self.prompt_tokens_buckets
2948
+ )
2949
+ self.validate_buckets_rule(
2950
+ "--generation-tokens-buckets", self.generation_tokens_buckets
2951
+ )
2952
+
2953
+ # Check scheduling policy
2954
+ if self.enable_priority_scheduling:
2955
+ assert self.schedule_policy in [
2956
+ "fcfs",
2957
+ "lof",
2958
+ ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
2188
2959
 
2189
2960
  def check_lora_server_args(self):
2190
2961
  assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
@@ -2269,6 +3040,12 @@ class ServerArgs:
2269
3040
  f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
2270
3041
  )
2271
3042
 
3043
+ if self.max_lora_chunk_size is not None:
3044
+ assert (
3045
+ 16 <= self.max_lora_chunk_size <= 128
3046
+ and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
3047
+ ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
3048
+
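The assertion above relies on the usual power-of-two test: `x & (x - 1)` clears the lowest set bit, so the result is zero only when `x` has a single bit set. A small illustration of which chunk sizes pass the check:

```python
# x & (x - 1) == 0 holds exactly for powers of two (for positive x),
# so combined with the range check only 16, 32, 64, 128 are accepted.
def is_valid_lora_chunk_size(x: int) -> bool:
    return 16 <= x <= 128 and (x & (x - 1)) == 0

print([x for x in (16, 24, 32, 64, 100, 128) if is_valid_lora_chunk_size(x)])
# [16, 32, 64, 128]
```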
2272
3049
  def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
2273
3050
  larger_tp = max(decode_tp, prefill_tp)
2274
3051
  smaller_tp = min(decode_tp, prefill_tp)
@@ -2277,79 +3054,53 @@ class ServerArgs:
2277
3054
  f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
2278
3055
  )
2279
3056
 
2280
- def model_specific_adjustments(self):
2281
- hf_config = self.get_hf_config()
2282
- model_arch = hf_config.architectures[0]
2283
- if model_arch in ["GptOssForCausalLM"]:
2284
- if self.attention_backend is None:
2285
- if is_cuda() and is_sm100_supported():
2286
- self.attention_backend = "trtllm_mha"
2287
- elif is_cuda() and is_sm90_supported():
2288
- self.attention_backend = "fa3"
2289
- else:
2290
- self.attention_backend = "triton"
2291
- supported_backends = ["triton", "trtllm_mha", "fa3"]
2292
- logger.info(
2293
- f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
2294
- )
2295
- assert (
2296
- self.attention_backend in supported_backends
2297
- ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
3057
+ def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
3058
+ if not buckets_rule:
3059
+ return
2298
3060
 
2299
- if is_sm100_supported():
2300
- if not self.enable_dp_attention:
2301
- self.enable_flashinfer_allreduce_fusion = True
2302
- logger.info(
2303
- "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
2304
- )
2305
- quantization_config = getattr(hf_config, "quantization_config", None)
2306
- is_mxfp4_quant_format = (
2307
- quantization_config is not None
2308
- and quantization_config.get("quant_method") == "mxfp4"
2309
- )
3061
+ assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
3062
+ rule = buckets_rule[0]
3063
+ assert rule in [
3064
+ "tse",
3065
+ "default",
3066
+ "custom",
3067
+ ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
2310
3068
 
2311
- if is_sm100_supported() and is_mxfp4_quant_format:
2312
- self.moe_runner_backend = "flashinfer_mxfp4"
2313
- logger.warning(
2314
- "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
2315
- )
2316
- else:
2317
- if self.moe_runner_backend == "triton_kernel":
2318
- assert (
2319
- self.ep_size == 1
2320
- ), "Triton kernel MoE is only supported when ep_size == 1"
2321
- if (
2322
- self.moe_runner_backend == "auto"
2323
- and self.ep_size == 1
2324
- and is_triton_kernels_available()
2325
- ):
2326
- self.moe_runner_backend = "triton_kernel"
2327
- logger.warning(
2328
- "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
2329
- )
2330
- self.disable_hybrid_swa_memory = True
2331
- if is_mxfp4_quant_format:
2332
- # use bf16 for mxfp4 triton kernels
2333
- self.dtype = "bfloat16"
3069
+ if rule == "tse":
3070
+ assert (
3071
+ len(buckets_rule) == 4
3072
+ ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
3073
+ try:
3074
+ middle = float(buckets_rule[1])
3075
+ base = float(buckets_rule[2])
3076
+ count = int(buckets_rule[3])
3077
+ except (ValueError, IndexError):
3078
+ assert (
3079
+ False
3080
+ ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
3081
+ assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
3082
+ assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
3083
+ assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
2334
3084
 
2335
- elif "Llama4" in model_arch:
2336
- assert self.attention_backend in {
2337
- "fa3",
2338
- "aiter",
2339
- }, "fa3 or aiter is required for Llama4 model"
2340
- elif model_arch in [
2341
- "Gemma2ForCausalLM",
2342
- "Gemma3ForCausalLM",
2343
- "Gemma3ForConditionalGeneration",
2344
- "Gemma3nForCausalLM",
2345
- "Gemma3nForConditionalGeneration",
2346
- ]:
2347
- # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
2348
- # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
2349
- logger.warning(
2350
- f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
2351
- )
2352
- self.disable_hybrid_swa_memory = True
3085
+ elif rule == "default":
3086
+ assert (
3087
+ len(buckets_rule) == 1
3088
+ ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
3089
+
3090
+ elif rule == "custom":
3091
+ assert (
3092
+ len(buckets_rule) >= 2
3093
+ ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
3094
+ try:
3095
+ bucket_values = [float(x) for x in buckets_rule[1:]]
3096
+ except ValueError:
3097
+ assert False, f"{arg_name} custom rule bucket values must be numeric"
3098
+ assert len(set(bucket_values)) == len(
3099
+ bucket_values
3100
+ ), f"{arg_name} custom rule bucket values should not contain duplicates"
3101
+ assert all(
3102
+ val >= 0 for val in bucket_values
3103
+ ), f"{arg_name} custom rule bucket values should be non-negative"
2353
3104
 
2354
3105
  def adjust_mem_fraction_for_vlm(self, model_config):
2355
3106
  vision_config = getattr(model_config.hf_config, "vision_config", None)
@@ -2401,6 +3152,26 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
2401
3152
  Returns:
2402
3153
  The server arguments.
2403
3154
  """
3155
+ # Import here to avoid circular imports
3156
+ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
3157
+
3158
+ # Check for config file and merge arguments if present
3159
+ if "--config" in argv:
3160
+ # Extract boolean actions from the parser to handle them correctly
3161
+ parser = argparse.ArgumentParser()
3162
+ ServerArgs.add_cli_args(parser)
3163
+
3164
+ # Get boolean action destinations
3165
+ boolean_actions = []
3166
+ for action in parser._actions:
3167
+ if hasattr(action, "dest") and hasattr(action, "action"):
3168
+ if action.action in ["store_true", "store_false"]:
3169
+ boolean_actions.append(action.dest)
3170
+
3171
+ # Merge config file arguments with CLI arguments
3172
+ config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
3173
+ argv = config_merger.merge_config_with_args(argv)
3174
+
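The `--config` flag and this merging hook allow server options to come from a YAML file. Below is a minimal usage sketch, assuming a flat `option: value` mapping with underscored keys and a placeholder model ID; the authoritative schema is defined by `ConfigArgumentMerger` in `sglang.srt.server_args_config_parser`, which is not part of this diff.

```python
# Hypothetical usage sketch. Whether YAML keys use underscores or dashes is an
# assumption; boolean options (e.g. enable_torch_compile) are the reason the
# code above collects store_true/store_false destinations before merging.
from pathlib import Path

from sglang.srt.server_args import prepare_server_args

Path("server.yaml").write_text(
    "model_path: meta-llama/Llama-3.1-8B-Instruct\n"
    "tp_size: 2\n"
    "enable_torch_compile: true\n"
)

# CLI flags given alongside --config are merged with the file contents.
server_args = prepare_server_args(["--config", "server.yaml", "--port", "30000"])
print(server_args.tp_size, server_args.port)
```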
2404
3175
  parser = argparse.ArgumentParser()
2405
3176
  ServerArgs.add_cli_args(parser)
2406
3177
  raw_args = parser.parse_args(argv)
@@ -2535,14 +3306,19 @@ def auto_choose_speculative_params(self: ServerArgs):
2535
3306
  """
2536
3307
  hf_config = self.get_hf_config()
2537
3308
  arch = hf_config.architectures[0]
2538
-
3309
+ if self.speculative_algorithm == "STANDALONE":
3310
+ # The default value for standalone speculative decoding
3311
+ return (3, 1, 4)
2539
3312
  if arch in ["LlamaForCausalLM"]:
2540
3313
  # The default value for llama
2541
3314
  return (5, 4, 8)
2542
3315
  elif arch in [
3316
+ "DeepseekV32ForCausalLM",
2543
3317
  "DeepseekV3ForCausalLM",
2544
3318
  "DeepseekV2ForCausalLM",
2545
3319
  "GptOssForCausalLM",
3320
+ "BailingMoeForCausalLM",
3321
+ "BailingMoeV2ForCausalLM",
2546
3322
  ]:
2547
3323
  # The default value for deepseek and gpt-oss
2548
3324
  return (3, 1, 4)