sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -13,20 +13,21 @@
13
13
  # ==============================================================================
14
14
  """The arguments of the server."""
15
15
 
16
+ from __future__ import annotations
17
+
16
18
  import argparse
17
19
  import dataclasses
18
20
  import json
19
21
  import logging
20
22
  import os
21
23
  import random
22
- import socket
23
- import sys
24
24
  import tempfile
25
- from typing import List, Literal, Optional, Union
25
+ from typing import Dict, List, Literal, Optional, Union
26
+
27
+ import orjson
26
28
 
27
29
  from sglang.srt.connector import ConnectorType
28
30
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
29
- from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
30
31
  from sglang.srt.lora.lora_registry import LoRARef
31
32
  from sglang.srt.parser.reasoning_parser import ReasoningParser
32
33
  from sglang.srt.utils import (
@@ -35,6 +36,7 @@ from sglang.srt.utils import (
35
36
  configure_ipv6,
36
37
  get_device,
37
38
  get_device_memory_capacity,
39
+ get_device_sm,
38
40
  is_cuda,
39
41
  is_flashinfer_available,
40
42
  is_hip,
@@ -43,12 +45,14 @@ from sglang.srt.utils import (
43
45
  is_remote_url,
44
46
  is_sm90_supported,
45
47
  is_sm100_supported,
48
+ is_sm120_supported,
46
49
  is_triton_kernels_available,
47
50
  is_valid_ipv6_address,
48
51
  json_list_type,
49
52
  nullable_str,
50
53
  parse_connector_type,
51
54
  )
55
+ from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
52
56
  from sglang.utils import is_in_ci
53
57
 
54
58
  logger = logging.getLogger(__name__)
@@ -79,6 +83,7 @@ QUANTIZATION_CHOICES = [
79
83
  "bitsandbytes",
80
84
  "gguf",
81
85
  "modelopt",
86
+ "modelopt_fp8",
82
87
  "modelopt_fp4",
83
88
  "petit_nvfp4",
84
89
  "w8a8_int8",
@@ -87,33 +92,59 @@ QUANTIZATION_CHOICES = [
87
92
  "qoq",
88
93
  "w4afp8",
89
94
  "mxfp4",
95
+ "compressed-tensors", # for Ktransformers
90
96
  ]
91
97
 
92
98
  ATTENTION_BACKEND_CHOICES = [
93
99
  # Common
94
100
  "triton",
95
101
  "torch_native",
102
+ "flex_attention",
103
+ "nsa",
96
104
  # NVIDIA specific
97
105
  "cutlass_mla",
98
106
  "fa3",
107
+ "fa4",
99
108
  "flashinfer",
100
109
  "flashmla",
101
110
  "trtllm_mla",
102
111
  "trtllm_mha",
103
112
  "dual_chunk_flash_attn",
104
- "hybrid_linear_attn",
105
113
  # AMD specific
106
114
  "aiter",
107
115
  "wave",
108
116
  # Other platforms
109
117
  "intel_amx",
110
118
  "ascend",
119
+ "intel_xpu",
111
120
  ]
112
121
 
122
+ LORA_BACKEND_CHOICES = ["triton", "csgmv"]
123
+
113
124
  DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
114
125
 
115
126
  GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
116
127
 
128
+ DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
129
+
130
+ DEFAULT_LORA_EVICTION_POLICY = "lru"
131
+
132
+ NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
133
+
134
+ RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
135
+
136
+ MOE_RUNNER_BACKEND_CHOICES = [
137
+ "auto",
138
+ "deep_gemm",
139
+ "triton",
140
+ "triton_kernel",
141
+ "flashinfer_trtllm",
142
+ "flashinfer_cutlass",
143
+ "flashinfer_mxfp4",
144
+ "flashinfer_cutedsl",
145
+ "cutlass",
146
+ ]
147
+
117
148
 
118
149
  # Allow external code to add more choices
119
150
  def add_load_format_choices(choices):
@@ -136,6 +167,18 @@ def add_grammar_backend_choices(choices):
136
167
  GRAMMAR_BACKEND_CHOICES.extend(choices)
137
168
 
138
169
 
170
+ def add_moe_runner_backend_choices(choices):
171
+ MOE_RUNNER_BACKEND_CHOICES.extend(choices)
172
+
173
+
174
+ def add_deterministic_attention_backend_choices(choices):
175
+ DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
176
+
177
+
178
+ def add_radix_eviction_policy_choices(choices):
179
+ RADIX_EVICTION_POLICY_CHOICES.extend(choices)
180
+
181
+
139
182
  @dataclasses.dataclass
140
183
  class ServerArgs:
141
184
  # Model and tokenizer
@@ -147,6 +190,11 @@ class ServerArgs:
147
190
  load_format: str = "auto"
148
191
  model_loader_extra_config: str = "{}"
149
192
  trust_remote_code: bool = False
193
+ modelopt_quant: Optional[Union[str, Dict]] = None
194
+ modelopt_checkpoint_restore_path: Optional[str] = None
195
+ modelopt_checkpoint_save_path: Optional[str] = None
196
+ modelopt_export_path: Optional[str] = None
197
+ quantize_and_serve: bool = False
150
198
  context_length: Optional[int] = None
151
199
  is_embedding: bool = False
152
200
  enable_multimodal: Optional[bool] = None
@@ -156,39 +204,50 @@ class ServerArgs:
156
204
  # HTTP server
157
205
  host: str = "127.0.0.1"
158
206
  port: int = 30000
207
+ grpc_mode: bool = False
159
208
  skip_server_warmup: bool = False
160
209
  warmups: Optional[str] = None
161
210
  nccl_port: Optional[int] = None
211
+ checkpoint_engine_wait_weights_before_ready: bool = False
162
212
 
163
213
  # Quantization and data type
164
214
  dtype: str = "auto"
165
215
  quantization: Optional[str] = None
166
216
  quantization_param_path: Optional[str] = None
167
217
  kv_cache_dtype: str = "auto"
218
+ enable_fp32_lm_head: bool = False
168
219
 
169
220
  # Memory and scheduling
170
221
  mem_fraction_static: Optional[float] = None
171
222
  max_running_requests: Optional[int] = None
172
- max_queued_requests: Optional[int] = sys.maxsize
223
+ max_queued_requests: Optional[int] = None
173
224
  max_total_tokens: Optional[int] = None
174
225
  chunked_prefill_size: Optional[int] = None
175
226
  max_prefill_tokens: int = 16384
176
227
  schedule_policy: str = "fcfs"
228
+ enable_priority_scheduling: bool = False
229
+ abort_on_priority_when_disabled: bool = False
230
+ schedule_low_priority_values_first: bool = False
231
+ priority_scheduling_preemption_threshold: int = 10
177
232
  schedule_conservativeness: float = 1.0
178
233
  page_size: Optional[int] = None
179
234
  hybrid_kvcache_ratio: Optional[float] = None
180
235
  swa_full_tokens_ratio: float = 0.8
181
236
  disable_hybrid_swa_memory: bool = False
237
+ radix_eviction_policy: str = "lru"
182
238
 
183
239
  # Runtime options
184
240
  device: Optional[str] = None
241
+ elastic_ep_backend: Literal[None, "mooncake"] = None
242
+ mooncake_ib_device: Optional[str] = None
185
243
  tp_size: int = 1
186
244
  pp_size: int = 1
187
- max_micro_batch_size: Optional[int] = None
245
+ pp_max_micro_batch_size: Optional[int] = None
188
246
  stream_interval: int = 1
189
247
  stream_output: bool = False
190
248
  random_seed: Optional[int] = None
191
249
  constrained_json_whitespace_pattern: Optional[str] = None
250
+ constrained_json_disable_any_whitespace: bool = False
192
251
  watchdog_timeout: float = 300
193
252
  dist_timeout: Optional[int] = None # timeout for torch.distributed
194
253
  download_dir: Optional[str] = None
@@ -205,8 +264,8 @@ class ServerArgs:
205
264
  show_time_cost: bool = False
206
265
  enable_metrics: bool = False
207
266
  enable_metrics_for_all_schedulers: bool = False
208
- tokenizer_metrics_custom_labels_header: str = "x-customer-labels"
209
- tokenizer_metrics_allowed_customer_labels: Optional[List[str]] = None
267
+ tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
268
+ tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
210
269
  bucket_time_to_first_token: Optional[List[float]] = None
211
270
  bucket_inter_token_latency: Optional[List[float]] = None
212
271
  bucket_e2e_request_latency: Optional[List[float]] = None
@@ -231,6 +290,7 @@ class ServerArgs:
231
290
  reasoning_parser: Optional[str] = None
232
291
  tool_call_parser: Optional[str] = None
233
292
  tool_server: Optional[str] = None
293
+ sampling_defaults: str = "model"
234
294
 
235
295
  # Data parallelism
236
296
  dp_size: int = 1
@@ -257,7 +317,9 @@ class ServerArgs:
257
317
  ] = None
258
318
  max_loaded_loras: Optional[int] = None
259
319
  max_loras_per_batch: int = 8
320
+ lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
260
321
  lora_backend: str = "triton"
322
+ max_lora_chunk_size: Optional[int] = 16
261
323
 
262
324
  # Kernel backend
263
325
  attention_backend: Optional[str] = None
@@ -266,11 +328,15 @@ class ServerArgs:
266
328
  sampling_backend: Optional[str] = None
267
329
  grammar_backend: Optional[str] = None
268
330
  mm_attention_backend: Optional[str] = None
331
+ nsa_prefill_backend: str = "flashmla_sparse"
332
+ nsa_decode_backend: str = "fa3"
269
333
 
270
334
  # Speculative decoding
335
+ enable_beta_spec: bool = False
271
336
  speculative_algorithm: Optional[str] = None
272
337
  speculative_draft_model_path: Optional[str] = None
273
338
  speculative_draft_model_revision: Optional[str] = None
339
+ speculative_draft_load_format: Optional[str] = None
274
340
  speculative_num_steps: Optional[int] = None
275
341
  speculative_eagle_topk: Optional[int] = None
276
342
  speculative_num_draft_tokens: Optional[int] = None
@@ -278,18 +344,19 @@ class ServerArgs:
278
344
  speculative_accept_threshold_acc: float = 1.0
279
345
  speculative_token_map: Optional[str] = None
280
346
  speculative_attention_mode: str = "prefill"
347
+ # For ngram only
348
+ speculative_ngram_min_match_window_size: int = 1
349
+ speculative_ngram_max_match_window_size: int = 12
350
+ speculative_ngram_min_bfs_breadth: int = 1
351
+ speculative_ngram_max_bfs_breadth: int = 10
352
+ speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
353
+ speculative_ngram_branch_length: int = 18
354
+ speculative_ngram_capacity: int = 10 * 1000 * 1000
281
355
 
282
356
  # Expert parallelism
283
357
  ep_size: int = 1
284
- moe_a2a_backend: Literal["none", "deepep"] = "none"
285
- moe_runner_backend: Literal[
286
- "auto",
287
- "triton",
288
- "triton_kernel",
289
- "flashinfer_trtllm",
290
- "flashinfer_cutlass",
291
- "flashinfer_mxfp4",
292
- ] = "auto"
358
+ moe_a2a_backend: Literal["none", "deepep", "mooncake"] = "none"
359
+ moe_runner_backend: str = "auto"
293
360
  flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
294
361
  enable_flashinfer_allreduce_fusion: bool = False
295
362
  deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
@@ -309,6 +376,11 @@ class ServerArgs:
309
376
  deepep_config: Optional[str] = None
310
377
  moe_dense_tp_size: Optional[int] = None
311
378
 
379
+ # Mamba cache
380
+ max_mamba_cache_size: Optional[int] = None
381
+ mamba_ssm_dtype: str = "float32"
382
+ mamba_full_memory_ratio: float = 0.9
383
+
312
384
  # Hierarchical cache
313
385
  enable_hierarchical_cache: bool = False
314
386
  hicache_ratio: float = 2.0
@@ -322,6 +394,13 @@ class ServerArgs:
322
394
  # LMCache
323
395
  enable_lmcache: bool = False
324
396
 
397
+ # Ktransformers
398
+ kt_amx_weight_path: Optional[str] = None
399
+ kt_amx_method: Optional[str] = None
400
+ kt_cpuinfer: Optional[int] = None
401
+ kt_threadpool_count: Optional[int] = None
402
+ kt_num_gpu_experts: Optional[int] = None
403
+
325
404
  # Double Sparsity
326
405
  enable_double_sparsity: bool = False
327
406
  ds_channel_config_path: Optional[str] = None
@@ -337,6 +416,12 @@ class ServerArgs:
337
416
  offload_prefetch_step: int = 1
338
417
  offload_mode: str = "cpu"
339
418
 
419
+ # Scoring configuration
420
+ # Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
421
+ # Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
422
+ # This enables efficient batch processing of multiple items against a single query.
423
+ multi_item_scoring_delimiter: Optional[Union[int]] = None
424
+
340
425
  # Optimization/debug options
341
426
  disable_radix_cache: bool = False
342
427
  cuda_graph_max_bs: Optional[int] = None
@@ -349,17 +434,24 @@ class ServerArgs:
349
434
  enable_symm_mem: bool = False
350
435
  disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
351
436
  enable_tokenizer_batch_encode: bool = False
437
+ disable_tokenizer_batch_decode: bool = False
352
438
  disable_outlines_disk_cache: bool = False
353
439
  disable_custom_all_reduce: bool = False
354
440
  enable_mscclpp: bool = False
441
+ enable_torch_symm_mem: bool = False
355
442
  disable_overlap_schedule: bool = False
356
443
  enable_mixed_chunk: bool = False
357
444
  enable_dp_attention: bool = False
358
445
  enable_dp_lm_head: bool = False
359
446
  enable_two_batch_overlap: bool = False
447
+ enable_single_batch_overlap: bool = False
360
448
  tbo_token_distribution_threshold: float = 0.48
361
449
  enable_torch_compile: bool = False
450
+ enable_piecewise_cuda_graph: bool = False
362
451
  torch_compile_max_bs: int = 32
452
+ piecewise_cuda_graph_max_tokens: int = 4096
453
+ piecewise_cuda_graph_tokens: Optional[List[int]] = None
454
+ piecewise_cuda_graph_compiler: str = "eager"
363
455
  torchao_config: str = ""
364
456
  enable_nan_detection: bool = False
365
457
  enable_p2p_check: bool = False
@@ -369,15 +461,18 @@ class ServerArgs:
369
461
  num_continuous_decode_steps: int = 1
370
462
  delete_ckpt_after_loading: bool = False
371
463
  enable_memory_saver: bool = False
464
+ enable_weights_cpu_backup: bool = False
372
465
  allow_auto_truncate: bool = False
373
466
  enable_custom_logit_processor: bool = False
374
467
  flashinfer_mla_disable_ragged: bool = False
375
468
  disable_shared_experts_fusion: bool = False
376
469
  disable_chunked_prefix_cache: bool = False
377
470
  disable_fast_image_processor: bool = False
471
+ keep_mm_feature_on_device: bool = False
378
472
  enable_return_hidden_states: bool = False
379
473
  scheduler_recv_interval: int = 1
380
474
  numa_node: Optional[List[int]] = None
475
+ enable_deterministic_inference: bool = False
381
476
 
382
477
  # Dynamic batch tokenizer
383
478
  enable_dynamic_batch_tokenizer: bool = False
@@ -388,7 +483,6 @@ class ServerArgs:
388
483
  debug_tensor_dump_output_folder: Optional[str] = None
389
484
  debug_tensor_dump_input_file: Optional[str] = None
390
485
  debug_tensor_dump_inject: bool = False
391
- debug_tensor_dump_prefill_only: bool = False
392
486
 
393
487
  # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
394
488
  disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
@@ -398,79 +492,147 @@ class ServerArgs:
398
492
  disaggregation_decode_dp: Optional[int] = None
399
493
  disaggregation_prefill_pp: Optional[int] = 1
400
494
  disaggregation_ib_device: Optional[str] = None
495
+ disaggregation_decode_enable_offload_kvcache: bool = False
401
496
  num_reserved_decode_tokens: int = 512 # used for decode kv cache offload in PD
402
-
403
497
  # FIXME: hack to reduce ITL when decode bs is small
404
498
  disaggregation_decode_polling_interval: int = 1
405
499
 
406
- # For model weight update
500
+ # For model weight update and weight loading
407
501
  custom_weight_loader: Optional[List[str]] = None
408
502
  weight_loader_disable_mmap: bool = False
409
-
410
- # Remote instance weight loading
411
503
  remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
412
504
  remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
413
505
  remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
414
506
 
415
507
  # For PD-Multiplexing
416
508
  enable_pdmux: bool = False
417
- sm_group_num: int = 3
418
-
419
- # Mamba cache
420
- max_mamba_cache_size: Optional[int] = None
421
- mamba_ssm_dtype: str = "float32"
509
+ pdmux_config_path: Optional[str] = None
510
+ sm_group_num: int = 8
422
511
 
423
- # Deprecated arguments
424
- enable_ep_moe: bool = False
425
- enable_deepep_moe: bool = False
426
- enable_flashinfer_cutlass_moe: bool = False
427
- enable_flashinfer_cutedsl_moe: bool = False
428
- enable_flashinfer_trtllm_moe: bool = False
429
- enable_triton_kernel_moe: bool = False
430
- enable_flashinfer_mxfp4_moe: bool = False
512
+ def get_attention_backends(server_args):
513
+ prefill_attention_backend_str = (
514
+ server_args.prefill_attention_backend
515
+ if server_args.prefill_attention_backend
516
+ else server_args.attention_backend
517
+ )
518
+ decode_attention_backend_str = (
519
+ server_args.decode_attention_backend
520
+ if server_args.decode_attention_backend
521
+ else server_args.attention_backend
522
+ )
523
+ return prefill_attention_backend_str, decode_attention_backend_str
431
524
 
432
525
  def __post_init__(self):
433
- # Check deprecated arguments
434
- if self.enable_ep_moe:
435
- self.ep_size = self.tp_size
436
- print_deprecated_warning(
437
- "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
438
- )
439
- if self.enable_deepep_moe:
440
- self.moe_a2a_backend = "deepep"
441
- print_deprecated_warning(
442
- "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
443
- )
444
- if self.enable_triton_kernel_moe:
445
- self.moe_runner_backend = "triton_kernel"
446
- print_deprecated_warning(
447
- "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
448
- )
449
- if self.enable_flashinfer_cutedsl_moe:
450
- self.moe_runner_backend = "flashinfer_cutedsl"
451
- print_deprecated_warning(
452
- "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
453
- )
454
- if self.enable_flashinfer_cutlass_moe:
455
- self.moe_runner_backend = "flashinfer_cutlass"
456
- print_deprecated_warning(
457
- "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
458
- )
459
- if self.enable_flashinfer_trtllm_moe:
460
- self.moe_runner_backend = "flashinfer_trtllm"
461
- print_deprecated_warning(
462
- "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
463
- )
464
- if self.enable_flashinfer_mxfp4_moe:
465
- self.moe_runner_backend = "flashinfer_mxfp4"
466
- print_deprecated_warning(
467
- "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
526
+ """
527
+ Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
528
+ """
529
+
530
+ if self.model_path.lower() in ["none", "dummy"]:
531
+ # Skip for dummy models
532
+ return
533
+
534
+ # Handle deprecated arguments.
535
+ self._handle_deprecated_args()
536
+
537
+ # Set missing default values.
538
+ self._handle_missing_default_values()
539
+
540
+ # Get GPU memory capacity, which is a common dependency for several configuration steps.
541
+ gpu_mem = get_device_memory_capacity(self.device)
542
+
543
+ # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
544
+ self._handle_gpu_memory_settings(gpu_mem)
545
+
546
+ # Handle device-specific backends.
547
+ self._handle_hpu_backends()
548
+ self._handle_cpu_backends()
549
+
550
+ # Apply model-specific adjustments.
551
+ self._handle_model_specific_adjustments()
552
+
553
+ # Set kernel backends.
554
+ self._handle_sampling_backend()
555
+ self._handle_attention_backend_compatibility()
556
+ self._handle_page_size()
557
+ self._handle_amd_specifics()
558
+ self._handle_grammar_backend()
559
+
560
+ # Handle Ktransformers specific configs
561
+ self._handle_ktransformers_configs()
562
+
563
+ # Handle data parallelism.
564
+ self._handle_data_parallelism()
565
+
566
+ # Handle MoE configurations.
567
+ self._handle_moe_kernel_config()
568
+ self._handle_a2a_moe()
569
+ self._handle_eplb_and_dispatch()
570
+ self._handle_expert_distribution_metrics()
571
+
572
+ # Handle pipeline parallelism.
573
+ self._handle_pipeline_parallelism()
574
+
575
+ # Handle Hicache settings.
576
+ self._handle_hicache()
577
+
578
+ # Handle speculative decoding logic.
579
+ self._handle_speculative_decoding()
580
+
581
+ # Handle model loading format.
582
+ self._handle_load_format()
583
+
584
+ # Handle PD disaggregation.
585
+ self._handle_disaggregation()
586
+
587
+ # Validate tokenizer settings.
588
+ self._handle_tokenizer_batching()
589
+
590
+ # Propagate environment variables.
591
+ self._handle_environment_variables()
592
+
593
+ # Validate cache settings.
594
+ self._handle_cache_compatibility()
595
+
596
+ # Validate metrics labels.
597
+ self._handle_metrics_labels()
598
+
599
+ # Handle deterministic inference.
600
+ self._handle_deterministic_inference()
601
+
602
+ # Handle any other necessary validations.
603
+ self._handle_other_validations()
604
+
605
+ # Handle elastic expert parallelism.
606
+ self._handle_elastic_ep()
607
+
608
+ def _handle_deprecated_args(self):
609
+ # handle deprecated tool call parsers
610
+ deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
611
+ if self.tool_call_parser in deprecated_tool_call_parsers:
612
+ logger.warning(
613
+ f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
468
614
  )
615
+ self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
469
616
 
470
- # Set missing default values
617
+ def _handle_ktransformers_configs(self):
618
+ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
619
+ CompressedTensorsWNA16AMXEPMoEMethod,
620
+ override_config,
621
+ )
622
+
623
+ override_config(
624
+ CompressedTensorsWNA16AMXEPMoEMethod,
625
+ self.kt_num_gpu_experts,
626
+ self.kt_cpuinfer,
627
+ self.kt_threadpool_count,
628
+ self.kt_amx_weight_path,
629
+ self.kt_amx_method,
630
+ self.chunked_prefill_size,
631
+ )
632
+
633
+ def _handle_missing_default_values(self):
471
634
  if self.tokenizer_path is None:
472
635
  self.tokenizer_path = self.model_path
473
-
474
636
  if self.served_model_name is None:
475
637
  self.served_model_name = self.model_path
476
638
  if self.device is None:
@@ -478,56 +640,165 @@ class ServerArgs:
478
640
  if self.random_seed is None:
479
641
  self.random_seed = random.randint(0, 1 << 30)
480
642
 
481
- gpu_mem = get_device_memory_capacity(self.device)
482
-
483
- # Set mem fraction static
484
- if self.mem_fraction_static is None:
485
- if gpu_mem is not None:
486
- # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
487
- # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
488
-
489
- # We want mem_fraction_static to be as large as possible but still has enough room
490
- # for activations and cuda graph buffers. We use the following heuristic to
491
- # compute the needed size for activations and cuda graph buffers:
492
- # - The size of the activation depends on the chunked_prefill_size and model size.
493
- # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
494
- # For GPUs with more memory, we use a larger chunked_prefill_size and
495
- # capture more cuda graphs, so they need to reserve more memory.
496
- parallel_size = self.tp_size * self.pp_size
497
-
498
- if gpu_mem < 20 * 1024:
499
- # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
500
- reserved_mem = (2.8 + parallel_size / 10) * 1024
501
- elif gpu_mem < 35 * 1024:
502
- # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
503
- reserved_mem = (2.8 + parallel_size / 10) * 1024
504
- elif gpu_mem < 90 * 1024:
505
- # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
506
- reserved_mem = (9.5 + parallel_size / 2) * 1024
507
- elif gpu_mem < 100 * 1024:
508
- # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
509
- reserved_mem = (12 + parallel_size / 2) * 1024
510
- elif gpu_mem < 160 * 1024:
511
- # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
512
- reserved_mem = (12 + parallel_size / 2) * 1024
513
- else:
514
- # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
515
- reserved_mem = 32 * 1024
516
-
517
- # draft model and larger cuda graph buffers
518
- if self.speculative_algorithm is not None:
519
- if self.speculative_algorithm == "STANDALONE":
520
- # Standalone speculative decoding needs more memory than other speculative
521
- # decoding algorithms since the draft model is typically larger.
522
- reserved_mem += 6 * 1024
643
+ def _handle_gpu_memory_settings(self, gpu_mem):
644
+ """
645
+ Configure GPU memory-dependent settings including
646
+ chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
647
+
648
+ Here are our heuristics:
649
+ - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
650
+ This is because GPUs with more memory are generally more powerful, we need to use a larger
651
+ chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
652
+ - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
653
+
654
+ GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
655
+
656
+ The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
657
+ or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
658
+
659
+ In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
660
+ The activation memory is proportional to the chunked_prefill_size.
661
+ The cuda graph memory is proportional to the cuda_graph_max_bs.
662
+ We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
663
+ and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
664
+
665
+ The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
666
+ """
667
+ if gpu_mem is not None:
668
+ if gpu_mem < 20 * 1024:
669
+ # T4, 4080
670
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
671
+ if self.chunked_prefill_size is None:
672
+ self.chunked_prefill_size = 2048
673
+ if self.cuda_graph_max_bs is None:
674
+ self.cuda_graph_max_bs = 8
675
+ elif is_npu() and gpu_mem < 32 * 1024:
676
+ # Atlas A2B4
677
+ # (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
678
+ if self.chunked_prefill_size is None:
679
+ self.chunked_prefill_size = 32768
680
+ if self.cuda_graph_max_bs is None:
681
+ if self.tp_size < 4:
682
+ self.cuda_graph_max_bs = 16
683
+ else:
684
+ self.cuda_graph_max_bs = 64
685
+ elif gpu_mem < 35 * 1024:
686
+ # A10, 4090, 5090
687
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
688
+ if self.chunked_prefill_size is None:
689
+ self.chunked_prefill_size = 2048
690
+ if self.cuda_graph_max_bs is None:
691
+ # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
692
+ # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
693
+ # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
694
+ if self.tp_size < 4:
695
+ self.cuda_graph_max_bs = 16
696
+ else:
697
+ self.cuda_graph_max_bs = 80
698
+ elif gpu_mem < 60 * 1024:
699
+ # A100 (40GB), L40,
700
+ # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
701
+ if self.chunked_prefill_size is None:
702
+ self.chunked_prefill_size = 4096
703
+ if self.cuda_graph_max_bs is None:
704
+ if self.tp_size < 4:
705
+ self.cuda_graph_max_bs = 32
523
706
  else:
524
- reserved_mem += 2 * 1024
525
- if self.enable_dp_attention:
526
- reserved_mem += 4 * 1024
707
+ self.cuda_graph_max_bs = 160
708
+ elif is_npu() and gpu_mem < 64 * 1024:
709
+ # Atlas A2 and Atlas A3
710
+ # (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
711
+ if self.chunked_prefill_size is None:
712
+ self.chunked_prefill_size = 32768
713
+ if self.cuda_graph_max_bs is None:
714
+ if self.tp_size < 4:
715
+ self.cuda_graph_max_bs = 64
716
+ else:
717
+ self.cuda_graph_max_bs = 128
718
+ elif gpu_mem < 90 * 1024:
719
+ # H100, A100
720
+ # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
721
+ if self.chunked_prefill_size is None:
722
+ self.chunked_prefill_size = 8192
723
+ if self.cuda_graph_max_bs is None:
724
+ if self.tp_size < 4:
725
+ self.cuda_graph_max_bs = 256
726
+ else:
727
+ self.cuda_graph_max_bs = 512
728
+ elif gpu_mem < 160 * 1024:
729
+ # H20, H200
730
+ # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
731
+ if self.chunked_prefill_size is None:
732
+ self.chunked_prefill_size = 8192
733
+ if self.cuda_graph_max_bs is None:
734
+ if self.tp_size < 4:
735
+ self.cuda_graph_max_bs = 256
736
+ else:
737
+ self.cuda_graph_max_bs = 512
738
+ else:
739
+ # B200, MI300
740
+ # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
741
+ if self.chunked_prefill_size is None:
742
+ self.chunked_prefill_size = 16384
743
+ if self.cuda_graph_max_bs is None:
744
+ self.cuda_graph_max_bs = 512
745
+ else:
746
+ # Fallback defaults when gpu_mem is None
747
+ if self.chunked_prefill_size is None:
748
+ self.chunked_prefill_size = 4096
749
+ if self.cuda_graph_max_bs is None:
750
+ self.cuda_graph_max_bs = 160
751
+
752
+ # Set cuda graph batch sizes
753
+ if self.cuda_graph_bs is None:
754
+ self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
755
+ else:
756
+ self.cuda_graph_max_bs = max(self.cuda_graph_bs)
527
757
 
528
- self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
758
+ if self.piecewise_cuda_graph_tokens is None:
759
+ self.piecewise_cuda_graph_tokens = (
760
+ self._generate_piecewise_cuda_graph_tokens()
761
+ )
762
+
763
+ if self.mem_fraction_static is None:
764
+ # Constant metadata (e.g., from attention backend)
765
+ reserved_mem = 512
766
+ # For activation during large prefill
767
+ if self.chunked_prefill_size > 0:
768
+ reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
529
769
  else:
530
- self.mem_fraction_static = 0.88
770
+ reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
771
+ # For cuda graphs
772
+ reserved_mem += self.cuda_graph_max_bs * 2
773
+ # Some adjustments for large parallel size
774
+ reserved_mem += self.tp_size * self.pp_size / 8 * 1024
775
+
776
+ if self.enable_dp_attention:
777
+ # DP attention needs more padding for some operations
778
+ reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
779
+
780
+ # DP attention uses much more memory for large cuda graph max bs,
781
+ # likely due to some inefficiencies in torch allocator or our implementation.
782
+ # So we need to reserve more memory.
783
+ if self.cuda_graph_max_bs > 300:
784
+ reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
785
+
786
+ if gpu_mem is not None and gpu_mem > 60 * 1024:
787
+ reserved_mem = max(reserved_mem, 10 * 1024)
788
+
789
+ if self.speculative_algorithm is not None:
790
+ if self.speculative_algorithm == "STANDALONE":
791
+ # standalone draft model and cuda graphs
792
+ reserved_mem += 6 * 1024
793
+ elif self.speculative_algorithm != "NGRAM":
794
+ # eagle draft models and cuda graphs
795
+ reserved_mem += 2 * 1024
796
+
797
+ self.mem_fraction_static = (
798
+ round((gpu_mem - reserved_mem) / gpu_mem, 3)
799
+ if gpu_mem is not None
800
+ else 0.88
801
+ )
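As a rough worked example of the reserved-memory heuristic above (a simplified sketch only; the helper name estimate_mem_fraction_static is hypothetical, units are MB, and the DP-attention and speculative-decoding adjustments are omitted): for an 80 GB GPU with the defaults of that tier (chunked_prefill_size=8192, cuda_graph_max_bs=256 for tp_size < 4), the computation works out as follows.

    def estimate_mem_fraction_static(
        gpu_mem_mb: float = 80 * 1024,       # ~80 GB GPU (H100/A100 tier)
        chunked_prefill_size: int = 8192,    # default chosen for the 60-90 GB tier
        cuda_graph_max_bs: int = 256,        # default for tp_size < 4
        tp_size: int = 1,
        pp_size: int = 1,
    ) -> float:
        """Simplified sketch of the reserved-memory heuristic above."""
        reserved_mb = 512                                     # constant metadata
        reserved_mb += max(chunked_prefill_size, 2048) * 1.5  # prefill activations
        reserved_mb += cuda_graph_max_bs * 2                  # cuda graph buffers
        reserved_mb += tp_size * pp_size / 8 * 1024           # parallelism overhead
        if gpu_mem_mb > 60 * 1024:
            reserved_mb = max(reserved_mb, 10 * 1024)         # floor for large GPUs
        return round((gpu_mem_mb - reserved_mb) / gpu_mem_mb, 3)

    # reserved ~= 512 + 12288 + 512 + 128 = 13440 MB, so the result is
    # (81920 - 13440) / 81920 ~= 0.836 for the defaults above.
    print(estimate_mem_fraction_static())  # 0.836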
531
802
 
532
803
  # Lazy init to avoid circular import
533
804
  # Multimodal models need more memory for the image processor
@@ -537,54 +808,266 @@ class ServerArgs:
537
808
  if model_config.is_multimodal:
538
809
  self.adjust_mem_fraction_for_vlm(model_config)
539
810
 
540
- # Set chunked prefill size, which depends on the gpu memory capacity
541
- if self.chunked_prefill_size is None:
542
- if gpu_mem is not None:
543
- if gpu_mem < 35 * 1024: # A10, L40, 4090
544
- self.chunked_prefill_size = 2048
545
- elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
546
- self.chunked_prefill_size = 8192
547
- else: # B200, MI300
548
- self.chunked_prefill_size = 16384
549
- else:
550
- self.chunked_prefill_size = 4096
811
+ def _generate_cuda_graph_batch_sizes(self):
812
+ """
813
+ Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
814
+ This integrates the logic from cuda_graph_runner.py.
815
+ """
816
+ # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
817
+ if self.disable_cuda_graph_padding:
818
+ capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
819
+ elif self.speculative_algorithm is None:
820
+ # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1, 32))
821
+ capture_bs = (
822
+ [1, 2, 4, 8, 12]
823
+ + list(range(16, 257, 8))
824
+ + list(range(272, 512, 16))
825
+ + list(range(512, self.cuda_graph_max_bs + 1, 32))
826
+ )
827
+ else:
828
+ # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8)) + list(range(272, cuda_graph_max_bs + 1, 16))
829
+ capture_bs = (
830
+ list(range(1, 9, 1))
831
+ + list(range(10, 33, 2))
832
+ + list(range(40, 64, 4))
833
+ + list(range(72, 257, 8))
834
+ + list(range(272, self.cuda_graph_max_bs + 1, 16))
835
+ )
551
836
 
552
- # Set cuda graph max batch size
553
- if self.cuda_graph_max_bs is None:
554
- # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
555
- if gpu_mem is not None and gpu_mem < 35 * 1024:
556
- if self.tp_size < 4:
557
- self.cuda_graph_max_bs = 8
558
- else:
559
- self.cuda_graph_max_bs = 80
837
+ capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]
838
+
839
+ return capture_bs
840
+
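For reference, a standalone sketch of the default capture schedule built above, assuming padding is enabled and no speculative algorithm is set; the function name capture_batch_sizes is made up and only mirrors the list construction.

    def capture_batch_sizes(cuda_graph_max_bs: int = 160) -> list:
        """Sketch of the default (non-speculative) CUDA graph capture schedule."""
        bs = (
            [1, 2, 4, 8, 12]
            + list(range(16, 257, 8))      # every 8 up to 256
            + list(range(272, 512, 16))    # every 16 between 272 and 511
            + list(range(512, cuda_graph_max_bs + 1, 32))
        )
        return [b for b in bs if b <= cuda_graph_max_bs]

    print(capture_batch_sizes(160)[:6])   # [1, 2, 4, 8, 12, 16]
    print(capture_batch_sizes(160)[-3:])  # [144, 152, 160]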
841
+ def _generate_piecewise_cuda_graph_tokens(self):
842
+ """
843
+ Generate the list of token counts for piecewise CUDA graph capture
844
+ based on piecewise_cuda_graph_max_tokens.
845
+ """
846
+ capture_sizes = (
847
+ list(range(4, 33, 4))
848
+ + list(range(48, 257, 16))
849
+ + list(range(288, 513, 32))
850
+ + list(range(640, 4096 + 1, 128))
851
+ + list(range(4352, self.piecewise_cuda_graph_max_tokens + 1, 256))
852
+ )
853
+
854
+ capture_sizes = [
855
+ s for s in capture_sizes if s <= self.piecewise_cuda_graph_max_tokens
856
+ ]
560
857
 
561
- # Set kernel backends for hpu device
858
+ return capture_sizes
859
+
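In the same spirit as the batch-size sketch above, the token schedule for piecewise capture can be previewed with a small hypothetical helper (piecewise_capture_tokens is not a real sglang function; it only mirrors the ranges in this diff).

    def piecewise_capture_tokens(max_tokens: int = 8192) -> list:
        """Sketch of the piecewise CUDA graph token schedule."""
        sizes = (
            list(range(4, 33, 4))
            + list(range(48, 257, 16))
            + list(range(288, 513, 32))
            + list(range(640, 4096 + 1, 128))
            + list(range(4352, max_tokens + 1, 256))
        )
        return [s for s in sizes if s <= max_tokens]

    print(piecewise_capture_tokens(8192)[:4])  # [4, 8, 12, 16]
    print(piecewise_capture_tokens(8192)[-1])  # 8192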
860
+ def _handle_hpu_backends(self):
562
861
  if self.device == "hpu":
563
862
  self.attention_backend = "torch_native"
564
863
  self.sampling_backend = "pytorch"
565
864
 
566
- # Model-specific adjustments
567
- if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
568
- self.model_specific_adjustments()
569
-
570
- # Set kernel backends
865
+ def _handle_cpu_backends(self):
571
866
  if self.device == "cpu":
572
867
  if self.attention_backend is None:
573
868
  self.attention_backend = "intel_amx"
574
869
  self.sampling_backend = "pytorch"
575
870
 
871
+ def _handle_model_specific_adjustments(self):
872
+ from sglang.srt.configs.model_config import is_deepseek_nsa
873
+
874
+ if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
875
+ return
876
+
877
+ hf_config = self.get_hf_config()
878
+ model_arch = hf_config.architectures[0]
879
+ if model_arch in ["DeepseekV3ForCausalLM"] and not is_deepseek_nsa(hf_config):
880
+ if is_cuda() and is_sm100_supported():
881
+ if (
882
+ self.attention_backend is None
883
+ and self.prefill_attention_backend is None
884
+ and self.decode_attention_backend is None
885
+ ):
886
+ self.attention_backend = "trtllm_mla"
887
+ logger.info(
888
+ "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
889
+ )
890
+ if not self.enable_dp_attention:
891
+ self.enable_flashinfer_allreduce_fusion = True
892
+ logger.info(
893
+ "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
894
+ )
895
+ if (
896
+ self.quantization == "modelopt_fp4"
897
+ and self.moe_runner_backend == "auto"
898
+ ):
899
+ self.moe_runner_backend = "flashinfer_trtllm"
900
+ logger.info(
901
+ "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
902
+ )
903
+
904
+ elif model_arch in ["GptOssForCausalLM"]:
905
+ if (
906
+ self.attention_backend is None
907
+ and self.prefill_attention_backend is None
908
+ and self.decode_attention_backend is None
909
+ ):
910
+ if is_cuda() and is_sm100_supported():
911
+ self.attention_backend = "trtllm_mha"
912
+ elif is_cuda() and is_sm90_supported():
913
+ self.attention_backend = "fa3"
914
+ else:
915
+ self.attention_backend = "triton"
916
+
917
+ supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
918
+ prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
919
+ assert (
920
+ prefill_attn_backend in supported_backends
921
+ and decode_attn_backend in supported_backends
922
+ ), (
923
+ f"GptOssForCausalLM requires one of the {supported_backends} attention backends, but got the following backends\n"
924
+ f"- Prefill: {prefill_attn_backend}\n"
925
+ f"- Decode: {decode_attn_backend}\n"
926
+ )
927
+
928
+ if is_sm100_supported():
929
+ if not self.enable_dp_attention:
930
+ self.enable_flashinfer_allreduce_fusion = True
931
+ logger.info(
932
+ "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
933
+ )
934
+ quantization_config = getattr(hf_config, "quantization_config", None)
935
+ is_mxfp4_quant_format = (
936
+ quantization_config is not None
937
+ and quantization_config.get("quant_method") == "mxfp4"
938
+ )
939
+
940
+ if is_sm100_supported() and is_mxfp4_quant_format:
941
+ self.moe_runner_backend = "flashinfer_mxfp4"
942
+ logger.warning(
943
+ "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
944
+ )
945
+ else:
946
+ if self.moe_runner_backend == "triton_kernel":
947
+ assert (
948
+ self.ep_size == 1
949
+ ), "Triton kernel MoE is only supported when ep_size == 1"
950
+ if (
951
+ self.moe_runner_backend == "auto"
952
+ and self.ep_size == 1
953
+ and is_triton_kernels_available()
954
+ ):
955
+ self.moe_runner_backend = "triton_kernel"
956
+ logger.warning(
957
+ "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
958
+ )
959
+ self.disable_hybrid_swa_memory = True
960
+ if is_mxfp4_quant_format:
961
+ # use bf16 for mxfp4 triton kernels
962
+ self.dtype = "bfloat16"
963
+
964
+ elif "Llama4" in model_arch and self.device != "cpu":
965
+ assert self.attention_backend in {
966
+ "fa3",
967
+ "aiter",
968
+ "triton",
969
+ }, "fa3, aiter, or triton is required for Llama4 model"
970
+ elif model_arch in [
971
+ "Gemma2ForCausalLM",
972
+ "Gemma3ForCausalLM",
973
+ "Gemma3ForConditionalGeneration",
974
+ "Gemma3nForCausalLM",
975
+ "Gemma3nForConditionalGeneration",
976
+ ]:
977
+ # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
978
+ # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
979
+ logger.warning(
980
+ f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
981
+ )
982
+ self.disable_hybrid_swa_memory = True
983
+ elif model_arch in ["Olmo2ForCausalLM"]:
984
+ # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with Olmo3 model.
985
+ logger.warning(
986
+ f"Disabling hybrid SWA memory for {model_arch} as it is not yet supported."
987
+ )
988
+ self.disable_hybrid_swa_memory = True
989
+
990
+ if self.attention_backend is None:
991
+ if is_cuda() and is_sm100_supported():
992
+ self.attention_backend = "trtllm_mha"
993
+ elif is_cuda() and get_device_sm() >= 80:
994
+ self.attention_backend = "fa3"
995
+ else:
996
+ self.attention_backend = "triton"
997
+
998
+ # Flashinfer appears to degrade performance when sliding window attention
999
+ # is used for the Olmo2 architecture. Olmo2 does not use sliding window attention
1000
+ # but Olmo3 does.
1001
+ assert (
1002
+ self.attention_backend != "flashinfer"
1003
+ ), "FlashInfer backend can significantly degrade the performance of Olmo3 models."
1004
+
1005
+ logger.info(
1006
+ f"Using {self.attention_backend} as attention backend for {model_arch}."
1007
+ )
1008
+
1009
+ if is_deepseek_nsa(hf_config):
1010
+ if (
1011
+ self.attention_backend is None
1012
+ and self.prefill_attention_backend is None
1013
+ and self.decode_attention_backend is None
1014
+ ):
1015
+ self.attention_backend = "nsa"
1016
+ logger.warning("Set nsa attention backend for DeepSeek NSA.")
1017
+
1018
+ if not is_npu():
1019
+ self.enable_dp_attention = True
1020
+ self.dp_size = self.tp_size
1021
+ logger.warning("DP attention is enabled for DeepSeek NSA.")
1022
+
1023
+ self.page_size = 64
1024
+ logger.warning("Setting page size to 64 for DeepSeek NSA.")
1025
+
1026
+ # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
1027
+ import torch
1028
+
1029
+ major, _ = torch.cuda.get_device_capability()
1030
+ if major >= 10:
1031
+ self.kv_cache_dtype = "fp8_e4m3"
1032
+ logger.warning("Setting KV cache dtype to fp8.")
1033
+
1034
+ if self.kv_cache_dtype == "fp8_e4m3":
1035
+ self.nsa_prefill_backend = "flashmla_kv"
1036
+ self.nsa_decode_backend = "flashmla_kv"
1037
+ logger.warning(
1038
+ "Setting NSA backend to flashmla_kv for FP8 KV Cache."
1039
+ )
1040
+
1041
+ # Logging env vars for NSA
1042
+ from sglang.srt.layers.attention.nsa.utils import (
1043
+ print_nsa_bool_env_vars,
1044
+ )
1045
+
1046
+ print_nsa_bool_env_vars()
1047
+
1048
+ def _handle_sampling_backend(self):
576
1049
  if self.sampling_backend is None:
577
1050
  self.sampling_backend = (
578
1051
  "flashinfer" if is_flashinfer_available() else "pytorch"
579
1052
  )
580
1053
 
1054
+ def _handle_attention_backend_compatibility(self):
581
1055
  if self.attention_backend == "torch_native":
582
1056
  logger.warning(
583
1057
  "Cuda graph is disabled because of using torch native attention backend"
584
1058
  )
585
1059
  self.disable_cuda_graph = True
586
1060
 
587
- if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
1061
+ if self.attention_backend == "flex_attention":
1062
+ logger.warning(
1063
+ "Cuda graph is disabled because of using torch Flex Attention backend"
1064
+ )
1065
+ self.disable_cuda_graph = True
1066
+ assert (
1067
+ self.speculative_algorithm is None
1068
+ ), "Speculative decoding is currently not supported with Flex Attention backend"
1069
+
1070
+ if is_npu() and self.attention_backend in ["ascend"]:
588
1071
  logger.warning(
589
1072
  "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
590
1073
  )
@@ -646,29 +1129,44 @@ class ServerArgs:
646
1129
 
647
1130
  if self.attention_backend == "dual_chunk_flash_attn":
648
1131
  logger.warning(
649
- "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
1132
+ "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
650
1133
  )
651
1134
  self.enable_mixed_chunk = False
652
- self.disable_cuda_graph = True
653
1135
  self.disable_radix_cache = True
654
1136
 
655
- # Set page size
1137
+ if self.attention_backend == "intel_xpu":
1138
+ if self.page_size not in [32, 64, 128]:
1139
+ logger.warning(
1140
+ f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
1141
+ )
1142
+ self.page_size = 128
1143
+ if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
1144
+ raise ValueError(
1145
+ "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
1146
+ )
1147
+ if self.prefill_attention_backend == "fa4":
1148
+ logger.warning(
1149
+ f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
1150
+ )
1151
+ self.page_size = 128
1152
+
1153
+ def _handle_page_size(self):
656
1154
  if self.page_size is None:
657
1155
  self.page_size = 1
658
1156
 
659
- # AMD-specific Triton attention KV splits default number
1157
+ def _handle_amd_specifics(self):
660
1158
  if is_hip():
661
1159
  self.triton_attention_num_kv_splits = 16
662
1160
 
663
- # Choose grammar backend
1161
+ def _handle_grammar_backend(self):
664
1162
  if self.grammar_backend is None:
665
1163
  self.grammar_backend = "xgrammar"
666
1164
 
1165
+ def _handle_data_parallelism(self):
667
1166
  if self.dp_size == 1:
668
1167
  self.enable_dp_attention = False
669
1168
  self.enable_dp_lm_head = False
670
1169
 
671
- # Data parallelism attention
672
1170
  if self.enable_dp_attention:
673
1171
  self.schedule_conservativeness = self.schedule_conservativeness * 0.3
674
1172
  assert self.tp_size % self.dp_size == 0
@@ -682,7 +1180,7 @@ class ServerArgs:
682
1180
  self.enable_dp_attention
683
1181
  ), "Please enable dp attention when setting enable_dp_lm_head. "
684
1182
 
685
- # MoE kernel
1183
+ def _handle_moe_kernel_config(self):
686
1184
  if self.moe_runner_backend == "flashinfer_cutlass":
687
1185
  assert (
688
1186
  self.quantization == "modelopt_fp4"
@@ -695,13 +1193,13 @@ class ServerArgs:
695
1193
  if self.moe_runner_backend == "flashinfer_trtllm":
696
1194
  assert (
697
1195
  self.quantization == "modelopt_fp4" or self.quantization == "fp8"
698
- ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
1196
+ ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
699
1197
  self.disable_shared_experts_fusion = True
700
1198
  logger.warning(
701
1199
  "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
702
1200
  )
703
1201
 
704
- # DeepEP MoE
1202
+ def _handle_a2a_moe(self):
705
1203
  if self.moe_a2a_backend == "deepep":
706
1204
  if self.deepep_mode == "normal":
707
1205
  logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -711,6 +1209,13 @@ class ServerArgs:
711
1209
  f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
712
1210
  )
713
1211
 
1212
+ if self.moe_a2a_backend == "mooncake":
1213
+ self.ep_size = self.tp_size
1214
+ logger.warning(
1215
+ f"Mooncake MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
1216
+ )
1217
+
1218
+ def _handle_eplb_and_dispatch(self):
714
1219
  if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
715
1220
  self.expert_distribution_recorder_mode = "stat"
716
1221
  logger.warning(
@@ -725,6 +1230,16 @@ class ServerArgs:
725
1230
  if self.enable_eplb:
726
1231
  assert self.ep_size > 1
727
1232
 
1233
+ def _handle_elastic_ep(self):
1234
+ if self.elastic_ep_backend is not None:
1235
+ if self.enable_eplb:
1236
+ if self.eplb_algorithm == "auto":
1237
+ self.eplb_algorithm = "elasticity_aware"
1238
+ assert (
1239
+ self.eplb_algorithm == "elasticity_aware"
1240
+ ), "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware'."
1241
+
1242
+ def _handle_expert_distribution_metrics(self):
728
1243
  if self.enable_expert_distribution_metrics and (
729
1244
  self.expert_distribution_recorder_mode is None
730
1245
  ):
@@ -736,18 +1251,24 @@ class ServerArgs:
736
1251
  elif self.expert_distribution_recorder_mode is not None:
737
1252
  self.expert_distribution_recorder_buffer_size = 1000
738
1253
 
739
- # Pipeline parallelism
1254
+ def _handle_pipeline_parallelism(self):
740
1255
  if self.pp_size > 1:
741
1256
  self.disable_overlap_schedule = True
742
1257
  logger.warning(
743
1258
  "Pipeline parallelism is incompatible with overlap schedule."
744
1259
  )
745
1260
 
746
- # Hicache
1261
+ def _handle_hicache(self):
747
1262
  if self.hicache_storage_backend == "mooncake":
748
- # to use mooncake storage backend, the following conditions must be met:
749
- self.hicache_io_backend = "kernel"
750
- self.hicache_mem_layout = "page_first"
1263
+ if self.hicache_mem_layout == "layer_first":
1264
+ if self.hicache_io_backend == "direct":
1265
+ self.hicache_mem_layout = "page_first_direct"
1266
+ elif self.hicache_io_backend == "kernel":
1267
+ self.hicache_mem_layout = "page_first"
1268
+ logger.warning(
1269
+ f"Mooncake storage backend does not support layer_first layout, "
1270
+ f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
1271
+ )
751
1272
 
752
1273
  if self.hicache_mem_layout == "page_first_direct":
753
1274
  if self.hicache_io_backend != "direct":
@@ -756,24 +1277,34 @@ class ServerArgs:
756
1277
  "Page first direct layout only support direct io backend"
757
1278
  )
758
1279
 
759
- # Speculative Decoding
1280
+ def _handle_speculative_decoding(self):
760
1281
  if self.speculative_algorithm == "NEXTN":
761
- # NEXTN shares the same implementation of EAGLE
762
1282
  self.speculative_algorithm = "EAGLE"
763
1283
 
764
1284
  if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
765
- if self.speculative_algorithm == "STANDALONE":
1285
+ if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
766
1286
  # TODO: support dp attention for standalone speculative decoding
767
- assert (
768
- self.enable_dp_attention is False
769
- ), "Currently standalone speculative decoding does not support dp attention."
1287
+ raise ValueError(
1288
+ "Currently standalone speculative decoding does not support dp attention."
1289
+ )
770
1290
  if self.max_running_requests is None:
771
1291
  self.max_running_requests = 48
772
- self.disable_overlap_schedule = True
773
- logger.warning(
774
- "Overlap scheduler is disabled because of using "
775
- "eagle speculative decoding."
776
- )
1292
+ logger.warning(
1293
+ "Max running requests is reset to 48 for speculative decoding."
1294
+ )
1295
+
1296
+ if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
1297
+ self.disable_overlap_schedule = False
1298
+ logger.warning(
1299
+ "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
1300
+ )
1301
+
1302
+ if not self.enable_beta_spec:
1303
+ self.disable_overlap_schedule = True
1304
+ logger.warning(
1305
+ "Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
1306
+ )
1307
+
777
1308
  if self.enable_mixed_chunk:
778
1309
  self.enable_mixed_chunk = False
779
1310
  logger.warning(
@@ -783,12 +1314,12 @@ class ServerArgs:
783
1314
 
784
1315
  model_arch = self.get_hf_config().architectures[0]
785
1316
  if model_arch in [
1317
+ "DeepseekV32ForCausalLM",
786
1318
  "DeepseekV3ForCausalLM",
787
1319
  "Glm4MoeForCausalLM",
788
1320
  "BailingMoeForCausalLM",
789
1321
  "BailingMoeV2ForCausalLM",
790
1322
  ]:
791
- # Auto set draft_model_path DeepSeek-V3/R1
792
1323
  if self.speculative_draft_model_path is None:
793
1324
  self.speculative_draft_model_path = self.model_path
794
1325
  else:
@@ -796,7 +1327,6 @@ class ServerArgs:
796
1327
  "DeepSeek MTP does not require setting speculative_draft_model_path."
797
1328
  )
798
1329
 
799
- # Auto choose parameters
800
1330
  if self.speculative_num_steps is None:
801
1331
  assert (
802
1332
  self.speculative_eagle_topk is None
@@ -836,11 +1366,43 @@ class ServerArgs:
836
1366
  "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
837
1367
  )
838
1368
 
839
- # The token generated from the verify step is counted.
840
- # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
841
- # assert self.speculative_num_steps < self.speculative_num_draft_tokens
1369
+ if self.speculative_algorithm == "NGRAM":
1370
+ if not self.device.startswith("cuda"):
1371
+ raise ValueError(
1372
+ "Ngram speculative decoding only supports CUDA device."
1373
+ )
1374
+ if self.max_running_requests is None:
1375
+ self.max_running_requests = 48
1376
+ self.disable_overlap_schedule = True
1377
+ self.enable_mixed_chunk = False
1378
+ self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
1379
+ if self.speculative_num_draft_tokens is None:
1380
+ self.speculative_num_draft_tokens = (
1381
+ self.speculative_ngram_max_match_window_size
1382
+ )
1383
+ logger.warning(
1384
+ "The overlap scheduler and mixed chunked prefill are disabled because of "
1385
+ "using ngram speculative decoding."
1386
+ )
842
1387
 
843
- # GGUF
1388
+ if (
1389
+ self.speculative_eagle_topk > 1
1390
+ and self.page_size > 1
1391
+ and self.attention_backend != "flashinfer"
1392
+ ):
1393
+ raise ValueError(
1394
+ f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
1395
+ f"with page_size({self.page_size}) > 1 is unstable "
1396
+ "and produces incorrect results for paged attention backends. "
1397
+ "This combination is only supported for the 'flashinfer' backend."
1398
+ )
1399
+ if self.enable_dp_attention:
1400
+ # TODO: support dp attention for ngram speculative decoding
1401
+ raise ValueError(
1402
+ "Currently ngram speculative decoding does not support dp attention."
1403
+ )
1404
+
1405
+ def _handle_load_format(self):
844
1406
  if (
845
1407
  self.load_format == "auto" or self.load_format == "gguf"
846
1408
  ) and check_gguf_file(self.model_path):
@@ -848,6 +1410,7 @@ class ServerArgs:
848
1410
 
849
1411
  if is_remote_url(self.model_path):
850
1412
  self.load_format = "remote"
1413
+
851
1414
  if self.custom_weight_loader is None:
852
1415
  self.custom_weight_loader = []
853
1416
 
@@ -859,7 +1422,7 @@ class ServerArgs:
859
1422
  ):
860
1423
  self.load_format = "auto"
861
1424
 
862
- # PD disaggregation
1425
+ def _handle_disaggregation(self):
863
1426
  if self.disaggregation_mode == "decode":
864
1427
  assert (
865
1428
  self.disaggregation_decode_tp is None
@@ -885,44 +1448,121 @@ class ServerArgs:
885
1448
 
886
1449
  self.disaggregation_prefill_pp = self.pp_size
887
1450
  self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
888
-
889
1451
  self.disable_cuda_graph = True
890
1452
  logger.warning("Cuda graph is disabled for prefill server")
891
1453
 
892
- # Validation: prevent both tokenizer batching features from being enabled
1454
+ def _handle_tokenizer_batching(self):
893
1455
  if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
894
1456
  raise ValueError(
895
1457
  "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
896
1458
  "Please choose one tokenizer batching approach."
897
1459
  )
898
1460
 
899
- # Propagate env vars
1461
+ if self.skip_tokenizer_init:
1462
+ if self.tokenizer_worker_num != 1:
1463
+ logger.warning(
1464
+ "skip_tokenizer_init=True disables tokenizer workers; forcing tokenizer_worker_num=1 "
1465
+ f"(requested {self.tokenizer_worker_num})."
1466
+ )
1467
+ self.tokenizer_worker_num = 1
1468
+
1469
+ if self.enable_tokenizer_batch_encode:
1470
+ logger.warning(
1471
+ "skip_tokenizer_init=True ignores --enable-tokenizer-batch-encode; disabling it."
1472
+ )
1473
+ self.enable_tokenizer_batch_encode = False
1474
+
1475
+ if self.enable_dynamic_batch_tokenizer:
1476
+ logger.warning(
1477
+ "skip_tokenizer_init=True ignores --enable-dynamic-batch-tokenizer; disabling it."
1478
+ )
1479
+ self.enable_dynamic_batch_tokenizer = False
1480
+
1481
+ def _handle_environment_variables(self):
900
1482
  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
901
1483
  "1" if self.enable_torch_compile else "0"
902
1484
  )
903
1485
  os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
904
-
905
- # Set env var before grammar backends init
906
1486
  os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
907
1487
  "1" if self.disable_outlines_disk_cache else "0"
908
1488
  )
1489
+ os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
1490
+ "1" if self.enable_deterministic_inference else "0"
1491
+ )
909
1492
 
1493
+ def _handle_cache_compatibility(self):
910
1494
  if self.enable_hierarchical_cache and self.disable_radix_cache:
911
1495
  raise ValueError(
912
1496
  "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
913
1497
  "and cannot be used at the same time. Please use only one of them."
914
1498
  )
915
1499
 
1500
+ if (
1501
+ self.disaggregation_decode_enable_offload_kvcache
1502
+ and self.disaggregation_mode != "decode"
1503
+ ):
1504
+ raise ValueError(
1505
+ "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
1506
+ )
1507
+
1508
+ def _handle_metrics_labels(self):
916
1509
  if (
917
1510
  not self.tokenizer_metrics_custom_labels_header
918
- and self.tokenizer_metrics_allowed_customer_labels
1511
+ and self.tokenizer_metrics_allowed_custom_labels
919
1512
  ):
920
1513
  raise ValueError(
921
- "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-customer-labels."
1514
+ "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
922
1515
  )
923
1516
 
1517
+ def _handle_deterministic_inference(self):
1518
+ if self.enable_deterministic_inference:
1519
+ # Check sampling backend
1520
+ self.sampling_backend = "pytorch"
1521
+ logger.warning(
1522
+ "Sampling backend is set to pytorch for deterministic inference."
1523
+ )
1524
+
1525
+ # Check attention backend
1526
+ if self.attention_backend is None:
1527
+ # User didn't specify attention backend, fallback based on GPU architecture
1528
+ if is_sm100_supported() or is_sm120_supported():
1529
+ # Blackwell and newer architectures
1530
+ self.attention_backend = "flashinfer"
1531
+ else:
1532
+ # Hopper (SM90) and older architectures
1533
+ self.attention_backend = "fa3"
1534
+ logger.warning(
1535
+ f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
1536
+ f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
1537
+ )
1538
+ elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
1539
+ # User explicitly specified an incompatible attention backend
1540
+ raise ValueError(
1541
+ f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
1542
+ f"but you explicitly specified '{self.attention_backend}'."
1543
+ )
1544
+
1545
+ # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
1546
+ if self.attention_backend not in ["fa3", "triton"]:
1547
+ self.disable_radix_cache = True
1548
+ logger.warning(
1549
+ f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
1550
+ )
1551
+
1552
+ # Check TP size
1553
+ if self.tp_size > 1:
1554
+ os.environ["NCCL_ALGO"] = "allreduce:tree"
1555
+ self.disable_custom_all_reduce = True
1556
+ logger.warning(
1557
+ "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
1558
+ )
1559
+
1560
+ def _handle_other_validations(self):
1561
+ pass
1562
+
924
1563
  @staticmethod
925
1564
  def add_cli_args(parser: argparse.ArgumentParser):
1565
+
926
1566
  # Model and tokenizer
927
1567
  parser.add_argument(
928
1568
  "--model-path",
@@ -931,24 +1571,6 @@ class ServerArgs:
931
1571
  help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
932
1572
  required=True,
933
1573
  )
934
- parser.add_argument(
935
- "--remote-instance-weight-loader-seed-instance-ip",
936
- type=str,
937
- default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
938
- help="The ip of the seed instance for loading weights from remote instance.",
939
- )
940
- parser.add_argument(
941
- "--remote-instance-weight-loader-seed-instance-service-port",
942
- type=int,
943
- default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
944
- help="The service port of the seed instance for loading weights from remote instance.",
945
- )
946
- parser.add_argument(
947
- "--remote-instance-weight-loader-send-weights-group-ports",
948
- type=json_list_type,
949
- default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
950
- help="The communication group ports for loading weights from remote instance.",
951
- )
952
1574
  parser.add_argument(
953
1575
  "--tokenizer-path",
954
1576
  type=str,
@@ -1060,6 +1682,11 @@ class ServerArgs:
1060
1682
  default=ServerArgs.port,
1061
1683
  help="The port of the HTTP server.",
1062
1684
  )
1685
+ parser.add_argument(
1686
+ "--grpc-mode",
1687
+ action="store_true",
1688
+ help="If set, use gRPC server instead of HTTP server.",
1689
+ )
1063
1690
  parser.add_argument(
1064
1691
  "--skip-server-warmup",
1065
1692
  action="store_true",
@@ -1078,6 +1705,12 @@ class ServerArgs:
1078
1705
  default=ServerArgs.nccl_port,
1079
1706
  help="The port for NCCL distributed environment setup. Defaults to a random port.",
1080
1707
  )
1708
+ parser.add_argument(
1709
+ "--checkpoint-engine-wait-weights-before-ready",
1710
+ action="store_true",
1711
+ help="If set, the server will wait for initial weights to be loaded via checkpoint-engine or other update methods "
1712
+ "before serving inference requests.",
1713
+ )
1081
1714
 
1082
1715
  # Quantization and data type
1083
1716
  parser.add_argument(
@@ -1110,12 +1743,56 @@ class ServerArgs:
1110
1743
  "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
1111
1744
  "default to 1.0, which may cause accuracy issues. ",
1112
1745
  )
1746
+ parser.add_argument(
1747
+ "--modelopt-quant",
1748
+ type=str,
1749
+ default=ServerArgs.modelopt_quant,
1750
+ help="The ModelOpt quantization configuration. "
1751
+ "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
1752
+ "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
1753
+ )
1754
+ parser.add_argument(
1755
+ "--modelopt-checkpoint-restore-path",
1756
+ type=str,
1757
+ default=ServerArgs.modelopt_checkpoint_restore_path,
1758
+ help="Path to restore a previously saved ModelOpt quantized checkpoint. "
1759
+ "If provided, the quantization process will be skipped and the model "
1760
+ "will be loaded from this checkpoint.",
1761
+ )
1762
+ parser.add_argument(
1763
+ "--modelopt-checkpoint-save-path",
1764
+ type=str,
1765
+ default=ServerArgs.modelopt_checkpoint_save_path,
1766
+ help="Path to save the ModelOpt quantized checkpoint after quantization. "
1767
+ "This allows reusing the quantized model in future runs.",
1768
+ )
1769
+ parser.add_argument(
1770
+ "--modelopt-export-path",
1771
+ type=str,
1772
+ default=ServerArgs.modelopt_export_path,
1773
+ help="Path to export the quantized model in HuggingFace format after ModelOpt quantization. "
1774
+ "The exported model can then be used directly with SGLang for inference. "
1775
+ "If not provided, the model will not be exported.",
1776
+ )
1777
+ parser.add_argument(
1778
+ "--quantize-and-serve",
1779
+ action="store_true",
1780
+ default=ServerArgs.quantize_and_serve,
1781
+ help="Quantize the model with ModelOpt and immediately serve it without exporting. "
1782
+ "This is useful for development and prototyping. For production, it's recommended "
1783
+ "to use separate quantization and deployment steps.",
1784
+ )
1113
1785
  parser.add_argument(
1114
1786
  "--kv-cache-dtype",
1115
1787
  type=str,
1116
1788
  default=ServerArgs.kv_cache_dtype,
1117
- choices=["auto", "fp8_e5m2", "fp8_e4m3"],
1118
- help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
1789
+ choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
1790
+ help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
1791
+ )
1792
+ parser.add_argument(
1793
+ "--enable-fp32-lm-head",
1794
+ action="store_true",
1795
+ help="If set, the LM head outputs (logits) are in FP32.",
1119
1796
  )
1120
1797
 
1121
1798
  # Memory and scheduling
@@ -1163,6 +1840,30 @@ class ServerArgs:
1163
1840
  choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
1164
1841
  help="The scheduling policy of the requests.",
1165
1842
  )
1843
+ parser.add_argument(
1844
+ "--enable-priority-scheduling",
1845
+ action="store_true",
1846
+ default=ServerArgs.enable_priority_scheduling,
1847
+ help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
1848
+ )
1849
+ parser.add_argument(
1850
+ "--abort-on-priority-when-disabled",
1851
+ action="store_true",
1852
+ default=ServerArgs.abort_on_priority_when_disabled,
1853
+ help="If set, abort requests that specify a priority when priority scheduling is disabled.",
1854
+ )
1855
+ parser.add_argument(
1856
+ "--schedule-low-priority-values-first",
1857
+ action="store_true",
1858
+ default=ServerArgs.schedule_low_priority_values_first,
1859
+ help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
1860
+ )
1861
+ parser.add_argument(
1862
+ "--priority-scheduling-preemption-threshold",
1863
+ type=int,
1864
+ default=ServerArgs.priority_scheduling_preemption_threshold,
1865
+ help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
1866
+ )
1166
1867
  parser.add_argument(
1167
1868
  "--schedule-conservativeness",
1168
1869
  type=float,
@@ -1207,6 +1908,21 @@ class ServerArgs:
1207
1908
  default=ServerArgs.device,
1208
1909
  help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
1209
1910
  )
1911
+ parser.add_argument(
1912
+ "--elastic-ep-backend",
1913
+ type=str,
1914
+ default=ServerArgs.elastic_ep_backend,
1915
+ choices=["none", "mooncake"],
1916
+ help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
1917
+ )
1918
+ parser.add_argument(
1919
+ "--mooncake-ib-device",
1920
+ type=str,
1921
+ default=ServerArgs.mooncake_ib_device,
1922
+ help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
1923
+ "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
1924
+ "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
1925
+ )
1210
1926
  parser.add_argument(
1211
1927
  "--tensor-parallel-size",
1212
1928
  "--tp-size",
@@ -1222,9 +1938,9 @@ class ServerArgs:
1222
1938
  help="The pipeline parallelism size.",
1223
1939
  )
1224
1940
  parser.add_argument(
1225
- "--max-micro-batch-size",
1941
+ "--pp-max-micro-batch-size",
1226
1942
  type=int,
1227
- default=ServerArgs.max_micro_batch_size,
1943
+ default=ServerArgs.pp_max_micro_batch_size,
1228
1944
  help="The maximum micro batch size in pipeline parallelism.",
1229
1945
  )
1230
1946
  parser.add_argument(
@@ -1248,7 +1964,12 @@ class ServerArgs:
1248
1964
  "--constrained-json-whitespace-pattern",
1249
1965
  type=str,
1250
1966
  default=ServerArgs.constrained_json_whitespace_pattern,
1251
- help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
1967
+ help="(outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model to generate consecutive whitespaces, set the pattern to [\n\t ]*",
1968
+ )
1969
+ parser.add_argument(
1970
+ "--constrained-json-disable-any-whitespace",
1971
+ action="store_true",
1972
+ help="(xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output.",
1252
1973
  )
1253
1974
  parser.add_argument(
1254
1975
  "--watchdog-timeout",
@@ -1338,16 +2059,16 @@ class ServerArgs:
1338
2059
  "--tokenizer-metrics-custom-labels-header",
1339
2060
  type=str,
1340
2061
  default=ServerArgs.tokenizer_metrics_custom_labels_header,
1341
- help="Specify the HTTP header for passing customer labels for tokenizer metrics.",
2062
+ help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
1342
2063
  )
1343
2064
  parser.add_argument(
1344
- "--tokenizer-metrics-allowed-customer-labels",
2065
+ "--tokenizer-metrics-allowed-custom-labels",
1345
2066
  type=str,
1346
2067
  nargs="+",
1347
- default=ServerArgs.tokenizer_metrics_allowed_customer_labels,
1348
- help="The customer labels allowed for tokenizer metrics. The labels are specified via a dict in "
2068
+ default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
2069
+ help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
1349
2070
  "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
1350
- "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
2071
+ "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
1351
2072
  )
1352
2073
  parser.add_argument(
1353
2074
  "--bucket-time-to-first-token",
@@ -1379,8 +2100,8 @@ class ServerArgs:
1379
2100
  bucket_rule = (
1380
2101
  "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
1381
2102
  "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
1382
- "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
1383
- "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
2103
+ "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]); 'custom <value1> "
2104
+ "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
1384
2105
  )
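To make the 'tse' rule concrete, here is one plausible generator that reproduces the documented example. This is a hedged sketch: the actual bucket-generation code is not shown in this diff, and the name two_sided_exponential_buckets is made up.

    def two_sided_exponential_buckets(middle: float, base: float, count: int) -> list:
        """Place `count` buckets at exponentially growing offsets on both sides of `middle`."""
        offsets = [base ** (i + 1) for i in range(count // 2)]
        buckets = {middle} | {middle - o for o in offsets} | {middle + o for o in offsets}
        return sorted(buckets)

    print(two_sided_exponential_buckets(1000.0, 2.0, 8))
    # [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]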
1385
2106
  parser.add_argument(
1386
2107
  "--prompt-tokens-buckets",
@@ -1489,6 +2210,16 @@ class ServerArgs:
1489
2210
  default=ServerArgs.tool_call_parser,
1490
2211
  help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
1491
2212
  )
2213
+ parser.add_argument(
2214
+ "--sampling-defaults",
2215
+ type=str,
2216
+ choices=["openai", "model"],
2217
+ default=ServerArgs.sampling_defaults,
2218
+ help="Where to get default sampling parameters. "
2219
+ "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
2220
+ "'model' uses the model's generation_config.json to get the recommended "
2221
+ "sampling parameters if available. Default is 'model'.",
2222
+ )
1492
2223
  parser.add_argument(
1493
2224
  "--tool-server",
1494
2225
  type=str,
@@ -1598,12 +2329,27 @@ class ServerArgs:
1598
2329
  default=ServerArgs.max_loaded_loras,
1599
2330
  help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
1600
2331
  )
2332
+ parser.add_argument(
2333
+ "--lora-eviction-policy",
2334
+ type=str,
2335
+ default=DEFAULT_LORA_EVICTION_POLICY,
2336
+ choices=["lru", "fifo"],
2337
+ help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
2338
+ )
1601
2339
  parser.add_argument(
1602
2340
  "--lora-backend",
1603
2341
  type=str,
1604
- default="triton",
2342
+ choices=LORA_BACKEND_CHOICES,
2343
+ default=ServerArgs.lora_backend,
1605
2344
  help="Choose the kernel backend for multi-LoRA serving.",
1606
2345
  )
2346
+ parser.add_argument(
2347
+ "--max-lora-chunk-size",
2348
+ type=int,
2349
+ default=ServerArgs.max_lora_chunk_size,
2350
+ choices=[16, 32, 64, 128],
2351
+ help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
2352
+ )
1607
2353
 
1608
2354
  # Kernel backend
1609
2355
  parser.add_argument(
@@ -1644,16 +2390,29 @@ class ServerArgs:
1644
2390
  parser.add_argument(
1645
2391
  "--mm-attention-backend",
1646
2392
  type=str,
1647
- choices=["sdpa", "fa3", "triton_attn"],
2393
+ choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
1648
2394
  default=ServerArgs.mm_attention_backend,
1649
2395
  help="Set multimodal attention backend.",
1650
2396
  )
2397
+ parser.add_argument(
2398
+ "--nsa-prefill-backend",
2399
+ default=ServerArgs.nsa_prefill_backend,
2400
+ type=str,
2401
+ choices=NSA_CHOICES,
2402
+ )
2403
+ parser.add_argument(
2404
+ "--nsa-decode-backend",
2405
+ default=ServerArgs.nsa_decode_backend,
2406
+ type=str,
2407
+ choices=NSA_CHOICES,
2408
+ )
1651
2409
 
1652
2410
  # Speculative decoding
2411
+ parser.add_argument("--enable-beta-spec", action="store_true")
1653
2412
  parser.add_argument(
1654
2413
  "--speculative-algorithm",
1655
2414
  type=str,
1656
- choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
2415
+ choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
1657
2416
  help="Speculative algorithm.",
1658
2417
  )
1659
2418
  parser.add_argument(
@@ -1670,6 +2429,15 @@ class ServerArgs:
1670
2429
  "name, a tag name, or a commit id. If unspecified, will use "
1671
2430
  "the default version.",
1672
2431
  )
2432
+ parser.add_argument(
2433
+ "--speculative-draft-load-format",
2434
+ type=str,
2435
+ default=ServerArgs.speculative_draft_load_format,
2436
+ choices=LOAD_FORMAT_CHOICES,
2437
+ help="The format of the draft model weights to load. "
2438
+ "If not specified, will use the same format as --load-format. "
2439
+ "Use 'dummy' to initialize draft model weights with random values for profiling.",
2440
+ )
1673
2441
  parser.add_argument(
1674
2442
  "--speculative-num-steps",
1675
2443
  type=int,
@@ -1713,6 +2481,50 @@ class ServerArgs:
1713
2481
  help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
1714
2482
  default=ServerArgs.speculative_attention_mode,
1715
2483
  )
2484
+ # Ngram speculative decoding
2485
+ parser.add_argument(
2486
+ "--speculative-ngram-min-match-window-size",
2487
+ type=int,
2488
+ default=ServerArgs.speculative_ngram_min_match_window_size,
2489
+ help="The minimum window size for pattern matching in ngram speculative decoding.",
2490
+ )
2491
+ parser.add_argument(
2492
+ "--speculative-ngram-max-match-window-size",
2493
+ type=int,
2494
+ default=ServerArgs.speculative_ngram_max_match_window_size,
2495
+ help="The maximum window size for pattern matching in ngram speculative decoding.",
2496
+ )
2497
+ parser.add_argument(
2498
+ "--speculative-ngram-min-bfs-breadth",
2499
+ type=int,
2500
+ default=ServerArgs.speculative_ngram_min_bfs_breadth,
2501
+ help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2502
+ )
2503
+ parser.add_argument(
2504
+ "--speculative-ngram-max-bfs-breadth",
2505
+ type=int,
2506
+ default=ServerArgs.speculative_ngram_max_bfs_breadth,
2507
+ help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2508
+ )
2509
+ parser.add_argument(
2510
+ "--speculative-ngram-match-type",
2511
+ type=str,
2512
+ choices=["BFS", "PROB"],
2513
+ default=ServerArgs.speculative_ngram_match_type,
2514
+ help="The match type for cache tree.",
2515
+ )
2516
+ parser.add_argument(
2517
+ "--speculative-ngram-branch-length",
2518
+ type=int,
2519
+ default=ServerArgs.speculative_ngram_branch_length,
2520
+ help="The branch length for ngram speculative decoding.",
2521
+ )
2522
+ parser.add_argument(
2523
+ "--speculative-ngram-capacity",
2524
+ type=int,
2525
+ default=ServerArgs.speculative_ngram_capacity,
2526
+ help="The cache capacity for ngram speculative decoding.",
2527
+ )
1716
2528
 
1717
2529
  # Expert parallelism
1718
2530
  parser.add_argument(
@@ -1726,22 +2538,14 @@ class ServerArgs:
1726
2538
  parser.add_argument(
1727
2539
  "--moe-a2a-backend",
1728
2540
  type=str,
1729
- choices=["none", "deepep"],
2541
+ choices=["none", "deepep", "mooncake"],
1730
2542
  default=ServerArgs.moe_a2a_backend,
1731
2543
  help="Choose the backend for MoE A2A.",
1732
2544
  )
1733
2545
  parser.add_argument(
1734
2546
  "--moe-runner-backend",
1735
2547
  type=str,
1736
- choices=[
1737
- "auto",
1738
- "triton",
1739
- "triton_kernel",
1740
- "flashinfer_trtllm",
1741
- "flashinfer_cutlass",
1742
- "flashinfer_mxfp4",
1743
- "flashinfer_cutedsl",
1744
- ],
2548
+ choices=MOE_RUNNER_BACKEND_CHOICES,
1745
2549
  default=ServerArgs.moe_runner_backend,
1746
2550
  help="Choose the runner backend for MoE.",
1747
2551
  )
@@ -1855,6 +2659,12 @@ class ServerArgs:
1855
2659
  choices=["float32", "bfloat16"],
1856
2660
  help="The data type of the SSM states in mamba cache.",
1857
2661
  )
2662
+ parser.add_argument(
2663
+ "--mamba-full-memory-ratio",
2664
+ type=float,
2665
+ default=ServerArgs.mamba_full_memory_ratio,
2666
+ help="The ratio of mamba state memory to full kv cache memory.",
2667
+ )
1858
2668
 
1859
2669
  # Hierarchical cache
1860
2670
  parser.add_argument(
@@ -1881,6 +2691,13 @@ class ServerArgs:
1881
2691
  default=ServerArgs.hicache_write_policy,
1882
2692
  help="The write policy of hierarchical cache.",
1883
2693
  )
2694
+ parser.add_argument(
2695
+ "--radix-eviction-policy",
2696
+ type=str,
2697
+ choices=RADIX_EVICTION_POLICY_CHOICES,
2698
+ default=ServerArgs.radix_eviction_policy,
2699
+ help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
2700
+ )
1884
2701
  parser.add_argument(
1885
2702
  "--hicache-io-backend",
1886
2703
  type=str,
@@ -1898,9 +2715,12 @@ class ServerArgs:
1898
2715
  parser.add_argument(
1899
2716
  "--hicache-storage-backend",
1900
2717
  type=str,
1901
- choices=["file", "mooncake", "hf3fs", "nixl"],
2718
+ choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
1902
2719
  default=ServerArgs.hicache_storage_backend,
1903
- help="The storage backend for hierarchical KV cache.",
2720
+ help="The storage backend for hierarchical KV cache. "
2721
+ "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
2722
+ "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
2723
+ "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
1904
2724
  )
1905
2725
  parser.add_argument(
1906
2726
  "--hicache-storage-prefetch-policy",
@@ -1922,6 +2742,35 @@ class ServerArgs:
1922
2742
  help="Using LMCache as an alternative hierarchical cache solution",
1923
2743
  )
1924
2744
 
2745
+ # Ktransformer server args
2746
+ parser.add_argument(
2747
+ "--kt-amx-weight-path",
2748
+ type=str,
2749
+ help="[ktransformers parameter] The path of the quantized expert weights for amx kernel. A local folder.",
2750
+ )
2751
+ parser.add_argument(
2752
+ "--kt-amx-method",
2753
+ type=str,
2754
+ default="AMXINT4",
2755
+ help="[ktransformers parameter] Quantization formats for CPU execution.",
2756
+ )
2757
+ parser.add_argument(
2758
+ "--kt-cpuinfer",
2759
+ type=int,
2760
+ help="[ktransformers parameter] The number of CPUInfer threads.",
2761
+ )
2762
+ parser.add_argument(
2763
+ "--kt-threadpool-count",
2764
+ type=int,
2765
+ default=2,
2766
+ help="[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA node).",
2767
+ )
2768
+ parser.add_argument(
2769
+ "--kt-num-gpu-experts",
2770
+ type=int,
2771
+ help="[ktransformers parameter] The number of GPU experts.",
2772
+ )
2773
+
1925
2774
  # Double Sparsity
1926
2775
  parser.add_argument(
1927
2776
  "--enable-double-sparsity",
@@ -1991,6 +2840,14 @@ class ServerArgs:
1991
2840
  help="Mode of offloading.",
1992
2841
  )
1993
2842
 
2843
+ # Args for multi-item-scoring
2844
+ parser.add_argument(
2845
+ "--multi-item-scoring-delimiter",
2846
+ type=int,
2847
+ default=ServerArgs.multi_item_scoring_delimiter,
2848
+ help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
2849
+ )
2850
+
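A small illustration of the sequence layout described in the help string above (purely illustrative: build_multi_item_sequence is a hypothetical helper and the token IDs are made up).

    def build_multi_item_sequence(query: list, items: list, delimiter: int) -> list:
        """Lay out Query<delimiter>Item1<delimiter>Item2... as one token sequence."""
        seq = list(query)
        for item in items:
            seq.append(delimiter)
            seq.extend(item)
        return seq

    query_ids = [101, 102, 103]            # made-up query token IDs
    item_ids = [[201, 202], [301, 302]]    # two candidate items
    print(build_multi_item_sequence(query_ids, item_ids, delimiter=7))
    # [101, 102, 103, 7, 201, 202, 7, 301, 302]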
1994
2851
  # Optimization/debug options
1995
2852
  parser.add_argument(
1996
2853
  "--disable-radix-cache",
@@ -2049,6 +2906,11 @@ class ServerArgs:
2049
2906
  action="store_true",
2050
2907
  help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
2051
2908
  )
2909
+ parser.add_argument(
2910
+ "--disable-tokenizer-batch-decode",
2911
+ action="store_true",
2912
+ help="Disable batch decoding when decoding multiple completions.",
2913
+ )
2052
2914
  parser.add_argument(
2053
2915
  "--disable-outlines-disk-cache",
2054
2916
  action="store_true",
@@ -2064,6 +2926,11 @@ class ServerArgs:
2064
2926
  action="store_true",
2065
2927
  help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
2066
2928
  )
2929
+ parser.add_argument(
2930
+ "--enable-torch-symm-mem",
2931
+ action="store_true",
2932
+ help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA devices SM90 and above. SM90 supports world size 4, 6, 8. SM100 supports world size 6, 8.",
2933
+ )
2067
2934
  parser.add_argument(
2068
2935
  "--disable-overlap-schedule",
2069
2936
  action="store_true",
@@ -2089,6 +2956,11 @@ class ServerArgs:
2089
2956
  action="store_true",
2090
2957
  help="Enabling two micro batches to overlap.",
2091
2958
  )
2959
+ parser.add_argument(
2960
+ "--enable-single-batch-overlap",
2961
+ action="store_true",
2962
+ help="Let computation and communication overlap within one micro batch.",
2963
+ )
2092
2964
  parser.add_argument(
2093
2965
  "--tbo-token-distribution-threshold",
2094
2966
  type=float,
@@ -2100,12 +2972,36 @@ class ServerArgs:
2100
2972
  action="store_true",
2101
2973
  help="Optimize the model with torch.compile. Experimental feature.",
2102
2974
  )
2975
+ parser.add_argument(
2976
+ "--enable-piecewise-cuda-graph",
2977
+ action="store_true",
2978
+ help="Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature.",
2979
+ )
2980
+ parser.add_argument(
2981
+ "--piecewise-cuda-graph-tokens",
2982
+ type=json_list_type,
2983
+ default=ServerArgs.piecewise_cuda_graph_tokens,
2984
+ help="Set the list of tokens when using piecewise cuda graph.",
2985
+ )
2986
+ parser.add_argument(
2987
+ "--piecewise-cuda-graph-compiler",
2988
+ type=str,
2989
+ default=ServerArgs.piecewise_cuda_graph_compiler,
2990
+ help="Set the compiler for piecewise cuda graph. Choices are: eager, inductor.",
2991
+ choices=["eager", "inductor"],
2992
+ )
2103
2993
  parser.add_argument(
2104
2994
  "--torch-compile-max-bs",
2105
2995
  type=int,
2106
2996
  default=ServerArgs.torch_compile_max_bs,
2107
2997
  help="Set the maximum batch size when using torch compile.",
2108
2998
  )
2999
+ parser.add_argument(
3000
+ "--piecewise-cuda-graph-max-tokens",
3001
+ type=int,
3002
+ default=ServerArgs.piecewise_cuda_graph_max_tokens,
3003
+ help="Set the maximum tokens when using piecewise cuda graph.",
3004
+ )
2109
3005
  parser.add_argument(
2110
3006
  "--torchao-config",
2111
3007
  type=str,
@@ -2158,6 +3054,11 @@ class ServerArgs:
2158
3054
  action="store_true",
2159
3055
  help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
2160
3056
  )
3057
+ parser.add_argument(
3058
+ "--enable-weights-cpu-backup",
3059
+ action="store_true",
3060
+ help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
3061
+ )
2161
3062
  parser.add_argument(
2162
3063
  "--allow-auto-truncate",
2163
3064
  action="store_true",
@@ -2188,6 +3089,11 @@ class ServerArgs:
2188
3089
  action="store_true",
2189
3090
  help="Adopt base image processor instead of fast image processor.",
2190
3091
  )
3092
+ parser.add_argument(
3093
+ "--keep-mm-feature-on-device",
3094
+ action="store_true",
3095
+ help="Keep multimodal feature tensors on device after processing to save D2H copy.",
3096
+ )
2191
3097
  parser.add_argument(
2192
3098
  "--enable-return-hidden-states",
2193
3099
  action="store_true",
@@ -2225,11 +3131,6 @@ class ServerArgs:
2225
3131
  default=ServerArgs.debug_tensor_dump_inject,
2226
3132
  help="Inject the outputs from jax as the input of every layer.",
2227
3133
  )
2228
- parser.add_argument(
2229
- "--debug-tensor-dump-prefill-only",
2230
- action="store_true",
2231
- help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
2232
- )
2233
3134
  parser.add_argument(
2234
3135
  "--enable-dynamic-batch-tokenizer",
2235
3136
  action="store_true",
@@ -2295,6 +3196,11 @@ class ServerArgs:
2295
3196
  "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
2296
3197
  "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
2297
3198
  )
3199
+ parser.add_argument(
3200
+ "--disaggregation-decode-enable-offload-kvcache",
3201
+ action="store_true",
3202
+ help="Enable async KV cache offloading on decode server (PD mode).",
3203
+ )
2298
3204
  parser.add_argument(
2299
3205
  "--num-reserved-decode-tokens",
2300
3206
  type=int,
@@ -2321,6 +3227,24 @@ class ServerArgs:
2321
3227
  action="store_true",
2322
3228
  help="Disable mmap while loading weight using safetensors.",
2323
3229
  )
3230
+ parser.add_argument(
3231
+ "--remote-instance-weight-loader-seed-instance-ip",
3232
+ type=str,
3233
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
3234
+ help="The ip of the seed instance for loading weights from remote instance.",
3235
+ )
3236
+ parser.add_argument(
3237
+ "--remote-instance-weight-loader-seed-instance-service-port",
3238
+ type=int,
3239
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
3240
+ help="The service port of the seed instance for loading weights from remote instance.",
3241
+ )
3242
+ parser.add_argument(
3243
+ "--remote-instance-weight-loader-send-weights-group-ports",
3244
+ type=json_list_type,
3245
+ default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
3246
+ help="The communication group ports for loading weights from remote instance.",
3247
+ )
2324
3248
 
2325
3249
  # For PD-Multiplexing
2326
3250
  parser.add_argument(
@@ -2328,6 +3252,12 @@ class ServerArgs:
2328
3252
  action="store_true",
2329
3253
  help="Enable PD-Multiplexing, PD running on greenctx stream.",
2330
3254
  )
3255
+ parser.add_argument(
3256
+ "--pdmux-config-path",
3257
+ type=str,
3258
+ default=None,
3259
+ help="The path of the PD-Multiplexing config file.",
3260
+ )
2331
3261
 
2332
3262
  parser.add_argument(
2333
3263
  "--sm-group-num",
@@ -2336,41 +3266,55 @@ class ServerArgs:
2336
3266
  help="Number of sm partition groups.",
2337
3267
  )
2338
3268
 
3269
+ # For deterministic inference
3270
+ parser.add_argument(
3271
+ "--enable-deterministic-inference",
3272
+ action="store_true",
3273
+ help="Enable deterministic inference mode with batch invariant ops.",
3274
+ )
3275
+
2339
3276
  # Deprecated arguments
2340
3277
  parser.add_argument(
2341
3278
  "--enable-ep-moe",
2342
- action="store_true",
2343
- help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
3279
+ action=DeprecatedAction,
3280
+ help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
2344
3281
  )
2345
3282
  parser.add_argument(
2346
3283
  "--enable-deepep-moe",
2347
- action="store_true",
2348
- help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
3284
+ action=DeprecatedAction,
3285
+ help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
2349
3286
  )
2350
3287
  parser.add_argument(
2351
3288
  "--enable-flashinfer-cutlass-moe",
2352
- action="store_true",
2353
- help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
3289
+ action=DeprecatedAction,
3290
+ help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
2354
3291
  )
2355
3292
  parser.add_argument(
2356
3293
  "--enable-flashinfer-cutedsl-moe",
2357
- action="store_true",
2358
- help="(Deprecated) Enable FlashInfer CuteDSL MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
3294
+ action=DeprecatedAction,
3295
+ help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
2359
3296
  )
2360
3297
  parser.add_argument(
2361
3298
  "--enable-flashinfer-trtllm-moe",
2362
- action="store_true",
2363
- help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
3299
+ action=DeprecatedAction,
3300
+ help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
2364
3301
  )
2365
3302
  parser.add_argument(
2366
3303
  "--enable-triton-kernel-moe",
2367
- action="store_true",
2368
- help="(Deprecated) Use triton moe grouped gemm kernel.",
3304
+ action=DeprecatedAction,
3305
+ help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
2369
3306
  )
2370
3307
  parser.add_argument(
2371
3308
  "--enable-flashinfer-mxfp4-moe",
2372
- action="store_true",
2373
- help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
3309
+ action=DeprecatedAction,
3310
+ help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
3311
+ )
3312
+
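For readers migrating launch scripts, the replacements named in the deprecation messages above can be summarized as follows; the sizes in the sample argv are placeholders.

```python
# Old flag (now rejected via DeprecatedAction)  ->  replacement per the help text above
# --enable-ep-moe                               ->  --ep-size set equal to --tp-size
# --enable-deepep-moe                           ->  --moe-a2a-backend deepep
# --enable-flashinfer-cutlass-moe               ->  --moe-runner-backend flashinfer_cutlass
# --enable-flashinfer-cutedsl-moe               ->  --moe-runner-backend flashinfer_cutedsl
# --enable-flashinfer-trtllm-moe                ->  --moe-runner-backend flashinfer_trtllm
# --enable-triton-kernel-moe                    ->  --moe-runner-backend triton_kernel
# --enable-flashinfer-mxfp4-moe                 ->  --moe-runner-backend flashinfer_mxfp4
new_style_argv = [
    "--tp-size", "8", "--ep-size", "8",         # placeholder sizes; ep == tp replaces --enable-ep-moe
    "--moe-a2a-backend", "deepep",
    "--moe-runner-backend", "flashinfer_cutlass",
]
```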
3313
+ # Configuration file support
3314
+ parser.add_argument(
3315
+ "--config",
3316
+ type=str,
3317
+ help="Read CLI options from a config file. Must be a YAML file with configuration options.",
2374
3318
  )
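A minimal sketch of how the new `--config` option might be used. The file name is hypothetical, and the YAML key spelling (CLI option names) is an assumption; only "YAML file with configuration options" is stated by the help text.

```python
# config.yaml (hypothetical contents):
#   model-path: meta-llama/Llama-3.1-8B-Instruct
#   tp-size: 2
#   enable-torch-compile: true   # boolean store_true flags are treated specially by the config merger
#
# CLI flags can still be passed alongside the file; prepare_server_args merges the two.
from sglang.srt.server_args import prepare_server_args

server_args = prepare_server_args(["--config", "config.yaml", "--port", "30000"])
```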
2375
3319
 
2376
3320
  @classmethod
@@ -2395,7 +3339,7 @@ class ServerArgs:
2395
3339
  self.model_path,
2396
3340
  trust_remote_code=self.trust_remote_code,
2397
3341
  revision=self.revision,
2398
- model_override_args=json.loads(self.json_model_override_args),
3342
+ model_override_args=orjson.loads(self.json_model_override_args),
2399
3343
  **kwargs,
2400
3344
  )
2401
3345
  return hf_config
@@ -2442,7 +3386,34 @@ class ServerArgs:
2442
3386
  self.chunked_prefill_size % self.page_size == 0
2443
3387
  ), "chunked_prefill_size must be divisible by page_size"
2444
3388
 
2445
- # Check multi tokenizer
3389
+ # Check pdmux
3390
+ if self.enable_pdmux:
3391
+ assert (
3392
+ self.pp_size == 1
3393
+ ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
3394
+ assert (
3395
+ self.chunked_prefill_size == -1
3396
+ ), "PD-Multiplexing is not compatible with chunked prefill."
3397
+ assert (
3398
+ self.disaggregation_mode == "null"
3399
+ ), "PD-Multiplexing is not compatible with disaggregation mode."
3400
+ assert (
3401
+ self.disable_overlap_schedule
3402
+ ), "PD-Multiplexing is not compatible with overlap schedule."
3403
+
3404
+ # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
3405
+ import torch
3406
+
3407
+ parts = torch.__version__.split("+", 1)[0].split(".")
3408
+ major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
3409
+ minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
3410
+ if (major, minor) > (2, 6):
3411
+ logger.warning(
3412
+ "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
3413
+ f" Current torch version is {torch.__version__}.\n"
3414
+ " Please manually install torch 2.6.x."
3415
+ )
3416
+
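A launch sketch that satisfies the PD-Multiplexing checks added above (chunked prefill disabled, overlap schedule disabled). It assumes `pp_size` defaults to 1 and `disaggregation_mode` defaults to "null"; the model path, config path, and group count are placeholders.

```python
from sglang.srt.server_args import prepare_server_args

server_args = prepare_server_args([
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",   # placeholder
    "--enable-pdmux",
    "--pdmux-config-path", "pdmux.yaml",                  # hypothetical config file
    "--sm-group-num", "4",                                # placeholder SM partition count
    "--disable-overlap-schedule",
    "--chunked-prefill-size", "-1",
    # pp-size and disaggregation-mode are left at their assumed defaults (1 and "null"),
    # which satisfies the remaining asserts in the pdmux check.
])
```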
2446
3417
  assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
2447
3418
  self.validate_buckets_rule(
2448
3419
  "--prompt-tokens-buckets", self.prompt_tokens_buckets
@@ -2451,6 +3422,24 @@ class ServerArgs:
2451
3422
  "--generation-tokens-buckets", self.generation_tokens_buckets
2452
3423
  )
2453
3424
 
3425
+ # Check scheduling policy
3426
+ if self.enable_priority_scheduling:
3427
+ assert self.schedule_policy in [
3428
+ "fcfs",
3429
+ "lof",
3430
+ ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
3431
+
3432
+ # Check multi-item scoring
3433
+ if self.multi_item_scoring_delimiter is not None:
3434
+ assert self.disable_radix_cache, (
3435
+ "Multi-item scoring requires radix cache to be disabled. "
3436
+ "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
3437
+ )
3438
+ assert self.chunked_prefill_size == -1, (
3439
+ "Multi-item scoring requires chunked prefill to be disabled. "
3440
+ "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
3441
+ )
3442
+
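A sketch of an argument combination that passes the new scheduling and multi-item scoring checks. The delimiter token id is illustrative, and the `--enable-priority-scheduling` spelling is inferred from the attribute checked above; `--multi-item-scoring-delimiter`, `--disable-radix-cache`, and `--chunked-prefill-size` come directly from the assertion messages.

```python
from sglang.srt.server_args import prepare_server_args

server_args = prepare_server_args([
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",   # placeholder
    # Priority scheduling requires an fcfs or lof policy.
    "--enable-priority-scheduling", "--schedule-policy", "lof",
    # Multi-item scoring requires radix cache and chunked prefill to be disabled.
    "--multi-item-scoring-delimiter", "128009",           # assumed token id, illustrative only
    "--disable-radix-cache",
    "--chunked-prefill-size", "-1",
])
```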
2454
3443
  def check_lora_server_args(self):
2455
3444
  assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
2456
3445
 
@@ -2534,6 +3523,12 @@ class ServerArgs:
2534
3523
  f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
2535
3524
  )
2536
3525
 
3526
+ if self.max_lora_chunk_size is not None:
3527
+ assert (
3528
+ 16 <= self.max_lora_chunk_size <= 128
3529
+ and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
3530
+ ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
3531
+
2537
3532
  def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
2538
3533
  larger_tp = max(decode_tp, prefill_tp)
2539
3534
  smaller_tp = min(decode_tp, prefill_tp)
@@ -2551,8 +3546,8 @@ class ServerArgs:
2551
3546
  assert rule in [
2552
3547
  "tse",
2553
3548
  "default",
2554
- "customer",
2555
- ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
3549
+ "custom",
3550
+ ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
2556
3551
 
2557
3552
  if rule == "tse":
2558
3553
  assert (
@@ -2575,95 +3570,20 @@ class ServerArgs:
2575
3570
  len(buckets_rule) == 1
2576
3571
  ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
2577
3572
 
2578
- elif rule == "customer":
3573
+ elif rule == "custom":
2579
3574
  assert (
2580
3575
  len(buckets_rule) >= 2
2581
- ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
3576
+ ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
2582
3577
  try:
2583
3578
  bucket_values = [float(x) for x in buckets_rule[1:]]
2584
3579
  except ValueError:
2585
- assert False, f"{arg_name} customer rule bucket values must be numeric"
3580
+ assert False, f"{arg_name} custom rule bucket values must be numeric"
2586
3581
  assert len(set(bucket_values)) == len(
2587
3582
  bucket_values
2588
- ), f"{arg_name} customer rule bucket values should not contain duplicates"
3583
+ ), f"{arg_name} custom rule bucket values should not contain duplicates"
2589
3584
  assert all(
2590
3585
  val >= 0 for val in bucket_values
2591
- ), f"{arg_name} customer rule bucket values should be non-negative"
2592
-
2593
- def model_specific_adjustments(self):
2594
- hf_config = self.get_hf_config()
2595
- model_arch = hf_config.architectures[0]
2596
- if model_arch in ["GptOssForCausalLM"]:
2597
- if self.attention_backend is None:
2598
- if is_cuda() and is_sm100_supported():
2599
- self.attention_backend = "trtllm_mha"
2600
- elif is_cuda() and is_sm90_supported():
2601
- self.attention_backend = "fa3"
2602
- else:
2603
- self.attention_backend = "triton"
2604
- supported_backends = ["triton", "trtllm_mha", "fa3"]
2605
- logger.info(
2606
- f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
2607
- )
2608
- assert (
2609
- self.attention_backend in supported_backends
2610
- ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
2611
-
2612
- if is_sm100_supported():
2613
- if not self.enable_dp_attention:
2614
- self.enable_flashinfer_allreduce_fusion = True
2615
- logger.info(
2616
- "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
2617
- )
2618
- quantization_config = getattr(hf_config, "quantization_config", None)
2619
- is_mxfp4_quant_format = (
2620
- quantization_config is not None
2621
- and quantization_config.get("quant_method") == "mxfp4"
2622
- )
2623
-
2624
- if is_sm100_supported() and is_mxfp4_quant_format:
2625
- self.moe_runner_backend = "flashinfer_mxfp4"
2626
- logger.warning(
2627
- "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
2628
- )
2629
- else:
2630
- if self.moe_runner_backend == "triton_kernel":
2631
- assert (
2632
- self.ep_size == 1
2633
- ), "Triton kernel MoE is only supported when ep_size == 1"
2634
- if (
2635
- self.moe_runner_backend == "auto"
2636
- and self.ep_size == 1
2637
- and is_triton_kernels_available()
2638
- ):
2639
- self.moe_runner_backend = "triton_kernel"
2640
- logger.warning(
2641
- "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
2642
- )
2643
- self.disable_hybrid_swa_memory = True
2644
- if is_mxfp4_quant_format:
2645
- # use bf16 for mxfp4 triton kernels
2646
- self.dtype = "bfloat16"
2647
-
2648
- elif "Llama4" in model_arch:
2649
- assert self.attention_backend in {
2650
- "fa3",
2651
- "aiter",
2652
- "triton",
2653
- }, "fa3, aiter, or triton is required for Llama4 model"
2654
- elif model_arch in [
2655
- "Gemma2ForCausalLM",
2656
- "Gemma3ForCausalLM",
2657
- "Gemma3ForConditionalGeneration",
2658
- "Gemma3nForCausalLM",
2659
- "Gemma3nForConditionalGeneration",
2660
- ]:
2661
- # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
2662
- # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
2663
- logger.warning(
2664
- f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
2665
- )
2666
- self.disable_hybrid_swa_memory = True
3586
+ ), f"{arg_name} custom rule bucket values should be non-negative"
2667
3587
 
2668
3588
  def adjust_mem_fraction_for_vlm(self, model_config):
2669
3589
  vision_config = getattr(model_config.hf_config, "vision_config", None)
@@ -2704,6 +3624,22 @@ class ServerArgs:
2704
3624
  )
2705
3625
 
2706
3626
 
3627
+ # NOTE: This is a global variable that holds the server args for the scheduler.
3628
+ _global_server_args: Optional[ServerArgs] = None
3629
+
3630
+
3631
+ def set_global_server_args_for_scheduler(server_args: ServerArgs):
3632
+ global _global_server_args
3633
+ _global_server_args = server_args
3634
+
3635
+
3636
+ def get_global_server_args() -> ServerArgs:
3637
+ if _global_server_args is None:
3638
+ raise ValueError("Global server args is not set yet!")
3639
+
3640
+ return _global_server_args
3641
+
3642
+
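A usage sketch for the new module-level accessors, assuming they live in `sglang.srt.server_args` alongside `ServerArgs` and are called from within the scheduler process; the model path is a placeholder.

```python
from sglang.srt.server_args import (
    get_global_server_args,
    prepare_server_args,
    set_global_server_args_for_scheduler,
)

server_args = prepare_server_args(["--model-path", "meta-llama/Llama-3.1-8B-Instruct"])
set_global_server_args_for_scheduler(server_args)

# Later, anywhere in the same process:
assert get_global_server_args() is server_args
# Calling get_global_server_args() before the setter raises ValueError.
```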
2707
3643
  def prepare_server_args(argv: List[str]) -> ServerArgs:
2708
3644
  """
2709
3645
  Prepare the server arguments from the command line arguments.
@@ -2715,14 +3651,35 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
2715
3651
  Returns:
2716
3652
  The server arguments.
2717
3653
  """
3654
+ # Import here to avoid circular imports
3655
+ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
3656
+
3657
+ # Check for config file and merge arguments if present
3658
+ if "--config" in argv:
3659
+ # Extract boolean actions from the parser to handle them correctly
3660
+ parser = argparse.ArgumentParser()
3661
+ ServerArgs.add_cli_args(parser)
3662
+
3663
+ # Get boolean action destinations
3664
+ boolean_actions = []
3665
+ for action in parser._actions:
3666
+ if hasattr(action, "dest") and hasattr(action, "action"):
3667
+ if action.action in ["store_true", "store_false"]:
3668
+ boolean_actions.append(action.dest)
3669
+
3670
+ # Merge config file arguments with CLI arguments
3671
+ config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
3672
+ argv = config_merger.merge_config_with_args(argv)
3673
+
2718
3674
  parser = argparse.ArgumentParser()
2719
3675
  ServerArgs.add_cli_args(parser)
2720
3676
  raw_args = parser.parse_args(argv)
2721
- server_args = ServerArgs.from_cli_args(raw_args)
2722
- return server_args
3677
+
3678
+ return ServerArgs.from_cli_args(raw_args)
2723
3679
 
2724
3680
 
2725
3681
  ZMQ_TCP_PORT_DELTA = 233
3682
+ DP_ATTENTION_HANDSHAKE_PORT_DELTA = 5
2726
3683
 
2727
3684
 
2728
3685
  @dataclasses.dataclass
@@ -2747,7 +3704,11 @@ class PortArgs:
2747
3704
  tokenizer_worker_ipc_name: Optional[str]
2748
3705
 
2749
3706
  @staticmethod
2750
- def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
3707
+ def init_new(
3708
+ server_args: ServerArgs,
3709
+ dp_rank: Optional[int] = None,
3710
+ worker_ports: Optional[List[int]] = None,
3711
+ ) -> PortArgs:
2751
3712
  if server_args.nccl_port is None:
2752
3713
  nccl_port = server_args.port + random.randint(100, 1000)
2753
3714
  while True:
@@ -2794,8 +3755,8 @@ class PortArgs:
2794
3755
  # TokenizerManager to DataParallelController
2795
3756
  scheduler_input_port = port_base + 4
2796
3757
  else:
2797
- scheduler_input_port = port_base + 4 + 1 + dp_rank
2798
-
3758
+ assert worker_ports is not None
3759
+ scheduler_input_port = worker_ports[dp_rank]
2799
3760
  return PortArgs(
2800
3761
  tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
2801
3762
  scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
@@ -2856,6 +3817,7 @@ def auto_choose_speculative_params(self: ServerArgs):
2856
3817
  # The default value for llama
2857
3818
  return (5, 4, 8)
2858
3819
  elif arch in [
3820
+ "DeepseekV32ForCausalLM",
2859
3821
  "DeepseekV3ForCausalLM",
2860
3822
  "DeepseekV2ForCausalLM",
2861
3823
  "GptOssForCausalLM",