sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -13,6 +13,8 @@
  # ==============================================================================
  """The arguments of the server."""

+ from __future__ import annotations
+
  import argparse
  import dataclasses
  import json
@@ -20,37 +22,48 @@ import logging
  import os
  import random
  import tempfile
- from typing import List, Literal, Optional, Union
+ from typing import Dict, List, Literal, Optional, Union
+
+ import orjson

  from sglang.srt.connector import ConnectorType
+ from sglang.srt.environ import envs
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
  from sglang.srt.lora.lora_registry import LoRARef
  from sglang.srt.parser.reasoning_parser import ReasoningParser
- from sglang.srt.utils import (
+ from sglang.srt.utils.common import (
  LORA_TARGET_ALL_MODULES,
  SUPPORTED_LORA_TARGET_MODULES,
  configure_ipv6,
+ cpu_has_amx_support,
  get_device,
  get_device_memory_capacity,
+ get_device_sm,
  is_cuda,
+ is_fa3_default_architecture,
  is_flashinfer_available,
  is_hip,
+ is_hopper_with_cuda_12_3,
+ is_no_spec_infer_or_topk_one,
  is_npu,
  is_port_available,
  is_remote_url,
  is_sm90_supported,
  is_sm100_supported,
+ is_sm120_supported,
  is_triton_kernels_available,
  is_valid_ipv6_address,
  json_list_type,
  nullable_str,
  parse_connector_type,
+ xpu_has_xmx_support,
  )
  from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
  from sglang.utils import is_in_ci

  logger = logging.getLogger(__name__)

+
  # Define constants
  LOAD_FORMAT_CHOICES = [
  "auto",
@@ -76,6 +89,7 @@ QUANTIZATION_CHOICES = [
  "bitsandbytes",
  "gguf",
  "modelopt",
+ "modelopt_fp8",
  "modelopt_fp4",
  "petit_nvfp4",
  "w8a8_int8",
@@ -84,6 +98,7 @@ QUANTIZATION_CHOICES = [
  "qoq",
  "w4afp8",
  "mxfp4",
+ "compressed-tensors", # for Ktransformers
  ]

  ATTENTION_BACKEND_CHOICES = [
@@ -107,6 +122,7 @@ ATTENTION_BACKEND_CHOICES = [
  # Other platforms
  "intel_amx",
  "ascend",
+ "intel_xpu",
  ]

  LORA_BACKEND_CHOICES = ["triton", "csgmv"]
@@ -117,10 +133,22 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]

  DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]

- NSA_CHOICES = ["flashmla_prefill", "flashmla_decode", "fa3", "tilelang", "aiter"]
+ NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]

  RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]

+ MOE_RUNNER_BACKEND_CHOICES = [
+ "auto",
+ "deep_gemm",
+ "triton",
+ "triton_kernel",
+ "flashinfer_trtllm",
+ "flashinfer_cutlass",
+ "flashinfer_mxfp4",
+ "flashinfer_cutedsl",
+ "cutlass",
+ ]
+

  # Allow external code to add more choices
  def add_load_format_choices(choices):
@@ -143,6 +171,10 @@ def add_grammar_backend_choices(choices):
  GRAMMAR_BACKEND_CHOICES.extend(choices)


+ def add_moe_runner_backend_choices(choices):
+ MOE_RUNNER_BACKEND_CHOICES.extend(choices)
+
+
  def add_deterministic_attention_backend_choices(choices):
  DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)

@@ -153,6 +185,15 @@ def add_radix_eviction_policy_choices(choices):

  @dataclasses.dataclass
  class ServerArgs:
+ """
+ The arguments of the server.
+
+ NOTE: When you add new arguments, please make sure the order
+ in this class definition the same as the order in the the function
+ `ServerArgs.add_cli_args`.
+ Please follow the existing style to group the new arguments into related groups or create new groups.
+ """
+
  # Model and tokenizer
  model_path: str
  tokenizer_path: Optional[str] = None
@@ -171,9 +212,11 @@ class ServerArgs:
  # HTTP server
  host: str = "127.0.0.1"
  port: int = 30000
+ grpc_mode: bool = False
  skip_server_warmup: bool = False
  warmups: Optional[str] = None
  nccl_port: Optional[int] = None
+ checkpoint_engine_wait_weights_before_ready: bool = False

  # Quantization and data type
  dtype: str = "auto"
@@ -181,6 +224,11 @@ class ServerArgs:
  quantization_param_path: Optional[str] = None
  kv_cache_dtype: str = "auto"
  enable_fp32_lm_head: bool = False
+ modelopt_quant: Optional[Union[str, Dict]] = None
+ modelopt_checkpoint_restore_path: Optional[str] = None
+ modelopt_checkpoint_save_path: Optional[str] = None
+ modelopt_export_path: Optional[str] = None
+ quantize_and_serve: bool = False

  # Memory and scheduling
  mem_fraction_static: Optional[float] = None
@@ -191,6 +239,7 @@ class ServerArgs:
  max_prefill_tokens: int = 16384
  schedule_policy: str = "fcfs"
  enable_priority_scheduling: bool = False
+ abort_on_priority_when_disabled: bool = False
  schedule_low_priority_values_first: bool = False
  priority_scheduling_preemption_threshold: int = 10
  schedule_conservativeness: float = 1.0
@@ -204,11 +253,12 @@ class ServerArgs:
  device: Optional[str] = None
  tp_size: int = 1
  pp_size: int = 1
- max_micro_batch_size: Optional[int] = None
+ pp_max_micro_batch_size: Optional[int] = None
  stream_interval: int = 1
  stream_output: bool = False
  random_seed: Optional[int] = None
  constrained_json_whitespace_pattern: Optional[str] = None
+ constrained_json_disable_any_whitespace: bool = False
  watchdog_timeout: float = 300
  dist_timeout: Optional[int] = None # timeout for torch.distributed
  download_dir: Optional[str] = None
@@ -233,10 +283,10 @@ class ServerArgs:
  collect_tokens_histogram: bool = False
  prompt_tokens_buckets: Optional[List[str]] = None
  generation_tokens_buckets: Optional[List[str]] = None
+ gc_warning_threshold_secs: float = 0.0
  decode_log_interval: int = 40
  enable_request_time_stats_logging: bool = False
  kv_events_config: Optional[str] = None
- gc_warning_threshold_secs: float = 0.0
  enable_trace: bool = False
  oltp_traces_endpoint: str = "localhost:4317"

@@ -251,6 +301,7 @@ class ServerArgs:
  reasoning_parser: Optional[str] = None
  tool_call_parser: Optional[str] = None
  tool_server: Optional[str] = None
+ sampling_defaults: str = "model"

  # Data parallelism
  dp_size: int = 1
@@ -277,6 +328,7 @@ class ServerArgs:
  ] = None
  max_loaded_loras: Optional[int] = None
  max_loras_per_batch: int = 8
+ lora_eviction_policy: str = "lru"
  lora_backend: str = "triton"
  max_lora_chunk_size: Optional[int] = 16

@@ -287,13 +339,14 @@ class ServerArgs:
  sampling_backend: Optional[str] = None
  grammar_backend: Optional[str] = None
  mm_attention_backend: Optional[str] = None
- nsa_prefill: str = "flashmla_prefill"
- nsa_decode: str = "fa3"
+ nsa_prefill_backend: str = "flashmla_sparse"
+ nsa_decode_backend: str = "fa3"

  # Speculative decoding
  speculative_algorithm: Optional[str] = None
  speculative_draft_model_path: Optional[str] = None
  speculative_draft_model_revision: Optional[str] = None
+ speculative_draft_load_format: Optional[str] = None
  speculative_num_steps: Optional[int] = None
  speculative_eagle_topk: Optional[int] = None
  speculative_num_draft_tokens: Optional[int] = None
@@ -312,15 +365,8 @@ class ServerArgs:

  # Expert parallelism
  ep_size: int = 1
- moe_a2a_backend: Literal["none", "deepep"] = "none"
- moe_runner_backend: Literal[
- "auto",
- "triton",
- "triton_kernel",
- "flashinfer_trtllm",
- "flashinfer_cutlass",
- "flashinfer_mxfp4",
- ] = "auto"
+ moe_a2a_backend: Literal["none", "deepep", "mooncake"] = "none"
+ moe_runner_backend: str = "auto"
  flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
  enable_flashinfer_allreduce_fusion: bool = False
  deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
@@ -339,10 +385,13 @@ class ServerArgs:
  enable_expert_distribution_metrics: bool = False
  deepep_config: Optional[str] = None
  moe_dense_tp_size: Optional[int] = None
+ elastic_ep_backend: Literal[None, "mooncake"] = None
+ mooncake_ib_device: Optional[str] = None

  # Mamba cache
  max_mamba_cache_size: Optional[int] = None
  mamba_ssm_dtype: str = "float32"
+ mamba_full_memory_ratio: float = 0.9

  # Hierarchical cache
  enable_hierarchical_cache: bool = False
@@ -357,6 +406,13 @@ class ServerArgs:
  # LMCache
  enable_lmcache: bool = False

+ # Ktransformers
+ kt_amx_weight_path: Optional[str] = None
+ kt_amx_method: Optional[str] = None
+ kt_cpuinfer: Optional[int] = None
+ kt_threadpool_count: Optional[int] = None
+ kt_num_gpu_experts: Optional[int] = None
+
  # Double Sparsity
  enable_double_sparsity: bool = False
  ds_channel_config_path: Optional[str] = None
@@ -372,6 +428,12 @@ class ServerArgs:
  offload_prefetch_step: int = 1
  offload_mode: str = "cpu"

+ # Scoring configuration
+ # Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
+ # Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
+ # This enables efficient batch processing of multiple items against a single query.
+ multi_item_scoring_delimiter: Optional[Union[int]] = None
+
  # Optimization/debug options
  disable_radix_cache: bool = False
  cuda_graph_max_bs: Optional[int] = None
@@ -384,6 +446,7 @@ class ServerArgs:
  enable_symm_mem: bool = False
  disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
  enable_tokenizer_batch_encode: bool = False
+ disable_tokenizer_batch_decode: bool = False
  disable_outlines_disk_cache: bool = False
  disable_custom_all_reduce: bool = False
  enable_mscclpp: bool = False
@@ -396,7 +459,11 @@ class ServerArgs:
  enable_single_batch_overlap: bool = False
  tbo_token_distribution_threshold: float = 0.48
  enable_torch_compile: bool = False
+ enable_piecewise_cuda_graph: bool = False
  torch_compile_max_bs: int = 32
+ piecewise_cuda_graph_max_tokens: int = 4096
+ piecewise_cuda_graph_tokens: Optional[List[int]] = None
+ piecewise_cuda_graph_compiler: str = "eager"
  torchao_config: str = ""
  enable_nan_detection: bool = False
  enable_p2p_check: bool = False
@@ -418,6 +485,7 @@ class ServerArgs:
  scheduler_recv_interval: int = 1
  numa_node: Optional[List[int]] = None
  enable_deterministic_inference: bool = False
+ rl_on_policy_target: Optional[str] = None

  # Dynamic batch tokenizer
  enable_dynamic_batch_tokenizer: bool = False
@@ -428,7 +496,6 @@ class ServerArgs:
  debug_tensor_dump_output_folder: Optional[str] = None
  debug_tensor_dump_input_file: Optional[str] = None
  debug_tensor_dump_inject: bool = False
- debug_tensor_dump_prefill_only: bool = False

  # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
  disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
@@ -452,12 +519,18 @@ class ServerArgs:

  # For PD-Multiplexing
  enable_pdmux: bool = False
- sm_group_num: int = 3
+ pdmux_config_path: Optional[str] = None
+ sm_group_num: int = 8

  def __post_init__(self):
  """
  Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
  """
+
+ if self.model_path.lower() in ["none", "dummy"]:
+ # Skip for dummy models
+ return
+
  # Handle deprecated arguments.
  self._handle_deprecated_args()

@@ -477,6 +550,9 @@ class ServerArgs:
  # Apply model-specific adjustments.
  self._handle_model_specific_adjustments()

+ # Handle Hicache settings.
+ self._handle_hicache()
+
  # Set kernel backends.
  self._handle_sampling_backend()
  self._handle_attention_backend_compatibility()
@@ -484,21 +560,21 @@ class ServerArgs:
  self._handle_amd_specifics()
  self._handle_grammar_backend()

+ # Handle Ktransformers specific configs
+ self._handle_ktransformers_configs()
+
  # Handle data parallelism.
  self._handle_data_parallelism()

  # Handle MoE configurations.
  self._handle_moe_kernel_config()
- self._handle_deepep_moe()
+ self._handle_a2a_moe()
  self._handle_eplb_and_dispatch()
  self._handle_expert_distribution_metrics()

  # Handle pipeline parallelism.
  self._handle_pipeline_parallelism()

- # Handle Hicache settings.
- self._handle_hicache()
-
  # Handle speculative decoding logic.
  self._handle_speculative_decoding()

@@ -526,8 +602,17 @@ class ServerArgs:
  # Handle any other necessary validations.
  self._handle_other_validations()

+ # Handle elastic expert parallelism.
+ self._handle_elastic_ep()
+
  def _handle_deprecated_args(self):
- pass
+ # handle deprecated tool call parsers
+ deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
+ if self.tool_call_parser in deprecated_tool_call_parsers:
+ logger.warning(
+ f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
+ )
+ self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]

  def _handle_missing_default_values(self):
  if self.tokenizer_path is None:
@@ -571,9 +656,19 @@ class ServerArgs:
  self.chunked_prefill_size = 2048
  if self.cuda_graph_max_bs is None:
  self.cuda_graph_max_bs = 8
+ elif is_npu() and gpu_mem < 32 * 1024:
+ # Atlas A2B4
+ # (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 32768
+ if self.cuda_graph_max_bs is None:
+ if self.tp_size < 4:
+ self.cuda_graph_max_bs = 16
+ else:
+ self.cuda_graph_max_bs = 64
  elif gpu_mem < 35 * 1024:
  # A10, 4090, 5090
- # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 24 if tp < 4 else 80)
  if self.chunked_prefill_size is None:
  self.chunked_prefill_size = 2048
  if self.cuda_graph_max_bs is None:
@@ -581,7 +676,7 @@ class ServerArgs:
  # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
  # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
  if self.tp_size < 4:
- self.cuda_graph_max_bs = 16
+ self.cuda_graph_max_bs = 24
  else:
  self.cuda_graph_max_bs = 80
  elif gpu_mem < 60 * 1024:
@@ -594,6 +689,16 @@ class ServerArgs:
  self.cuda_graph_max_bs = 32
  else:
  self.cuda_graph_max_bs = 160
+ elif is_npu() and gpu_mem < 64 * 1024:
+ # Atlas A2 and Atlas A3
+ # (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 32768
+ if self.cuda_graph_max_bs is None:
+ if self.tp_size < 4:
+ self.cuda_graph_max_bs = 64
+ else:
+ self.cuda_graph_max_bs = 128
  elif gpu_mem < 90 * 1024:
  # H100, A100
  # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
@@ -634,6 +739,11 @@ class ServerArgs:
  else:
  self.cuda_graph_max_bs = max(self.cuda_graph_bs)

+ if self.piecewise_cuda_graph_tokens is None:
+ self.piecewise_cuda_graph_tokens = (
+ self._generate_piecewise_cuda_graph_tokens()
+ )
+
  if self.mem_fraction_static is None:
  # Constant meta data (e.g., from attention backend)
  reserved_mem = 512
@@ -674,11 +784,9 @@ class ServerArgs:
  else 0.88
  )

- # Lazy init to avoid circular import
- # Multimodal models need more memory for the image processor
- from sglang.srt.configs.model_config import ModelConfig
-
- model_config = ModelConfig.from_server_args(self)
+ # Multimodal models need more memory for the image processing,
+ # so we adjust the mem_fraction_static accordingly.
+ model_config = self.get_model_config()
  if model_config.is_multimodal:
  self.adjust_mem_fraction_for_vlm(model_config)

@@ -712,6 +820,25 @@ class ServerArgs:

  return capture_bs

+ def _generate_piecewise_cuda_graph_tokens(self):
+ """
+ Generate the list of batch sizes for piecewise CUDA graph capture
+ based on piecewise_cuda_graph_max_tokens.
+ """
+ capture_sizes = (
+ list(range(4, 33, 4))
+ + list(range(48, 257, 16))
+ + list(range(288, 513, 32))
+ + list(range(640, 4096 + 1, 128))
+ + list(range(4352, self.piecewise_cuda_graph_max_tokens + 1, 256))
+ )
+
+ capture_sizes = [
+ s for s in capture_sizes if s <= self.piecewise_cuda_graph_max_tokens
+ ]
+
+ return capture_sizes
+
  def _handle_hpu_backends(self):
  if self.device == "hpu":
  self.attention_backend = "torch_native"
@@ -731,21 +858,59 @@ class ServerArgs:

  hf_config = self.get_hf_config()
  model_arch = hf_config.architectures[0]
- if model_arch in ["GptOssForCausalLM"]:
- if self.attention_backend is None:
+ if model_arch in ["DeepseekV3ForCausalLM"] and not is_deepseek_nsa(hf_config):
+ if is_cuda() and is_sm100_supported():
+ if (
+ self.attention_backend is None
+ and self.prefill_attention_backend is None
+ and self.decode_attention_backend is None
+ ):
+ self.attention_backend = "trtllm_mla"
+ logger.info(
+ "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
+ )
+ if not self.enable_dp_attention:
+ self.enable_flashinfer_allreduce_fusion = True
+ logger.info(
+ "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
+ )
+ if self.moe_runner_backend == "auto":
+ self.moe_runner_backend = "flashinfer_trtllm"
+ logger.info(
+ "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
+ )
+ if self.quantization is None:
+ # Default DeepSeek V3/R1 native FP8 when not explicitly set,
+ # Because we need this condition for an assertion in
+ # flashinfer_trtllm MoE runner backend.
+ self.quantization = "fp8"
+ logger.info(
+ "Quantization not specified, default to fp8 for DeepSeek on sm100"
+ )
+
+ elif model_arch in ["GptOssForCausalLM"]:
+ if (
+ self.attention_backend is None
+ and self.prefill_attention_backend is None
+ and self.decode_attention_backend is None
+ ):
  if is_cuda() and is_sm100_supported():
  self.attention_backend = "trtllm_mha"
  elif is_cuda() and is_sm90_supported():
  self.attention_backend = "fa3"
  else:
  self.attention_backend = "triton"
- supported_backends = ["triton", "trtllm_mha", "fa3"]
- logger.info(
- f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
- )
+
+ supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
+ prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
  assert (
- self.attention_backend in supported_backends
- ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+ prefill_attn_backend in supported_backends
+ and decode_attn_backend in supported_backends
+ ), (
+ f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got the following backends\n"
+ f"- Prefill: {prefill_attn_backend}\n"
+ f"- Decode: {decode_attn_backend}\n"
+ )

  if is_sm100_supported():
  if not self.enable_dp_attention:
@@ -788,7 +953,13 @@ class ServerArgs:
  "fa3",
  "aiter",
  "triton",
- }, "fa3, aiter, or triton is required for Llama4 model"
+ "trtllm_mha",
+ }, "fa3, aiter, triton, or trtllm_mha is required for Llama4 model"
+ if is_sm100_supported() and self.attention_backend is None:
+ self.attention_backend = "trtllm_mha"
+ logger.warning(
+ "Use trtllm_mha as attention backend on sm100 for Llama4 model"
+ )
  elif model_arch in [
  "Gemma2ForCausalLM",
  "Gemma3ForCausalLM",
@@ -802,6 +973,31 @@ class ServerArgs:
  f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
  )
  self.disable_hybrid_swa_memory = True
+ elif model_arch in ["Olmo2ForCausalLM"]:
+ # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with Olmo3 model.
+ logger.warning(
+ f"Disabling hybrid SWA memory for {model_arch} as it is not yet supported."
+ )
+ self.disable_hybrid_swa_memory = True
+
+ if self.attention_backend is None:
+ if is_cuda() and is_sm100_supported():
+ self.attention_backend = "trtllm_mha"
+ elif is_cuda() and get_device_sm() >= 80:
+ self.attention_backend = "fa3"
+ else:
+ self.attention_backend = "triton"
+
+ # Flashinfer appears to degrade performance when sliding window attention
+ # is used for the Olmo2 architecture. Olmo2 does not use sliding window attention
+ # but Olmo3 does.
+ assert (
+ self.attention_backend != "flashinfer"
+ ), "FlashInfer backend can significantly degrade the performance of Olmo3 models."
+
+ logger.info(
+ f"Using {self.attention_backend} as attention backend for {model_arch}."
+ )

  if is_deepseek_nsa(hf_config):
  if (
@@ -820,9 +1016,6 @@ class ServerArgs:
  self.page_size = 64
  logger.warning("Setting page size to 64 for DeepSeek NSA.")

- self.mem_fraction_static = 0.8
- logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
-
  # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
  import torch

@@ -832,10 +1025,10 @@ class ServerArgs:
  logger.warning("Setting KV cache dtype to fp8.")

  if self.kv_cache_dtype == "fp8_e4m3":
- self.nsa_prefill = "flashmla_decode"
- self.nsa_decode = "flashmla_decode"
+ self.nsa_prefill_backend = "flashmla_kv"
+ self.nsa_decode_backend = "flashmla_kv"
  logger.warning(
- "Setting NSA backend to flashmla_decode for FP8 KV Cache."
+ "Setting NSA backend to flashmla_kv for FP8 KV Cache."
  )

  # Logging env vars for NSA
@@ -852,6 +1045,67 @@ class ServerArgs:
  )

  def _handle_attention_backend_compatibility(self):
+ model_config = self.get_model_config()
+ use_mla_backend = self.use_mla_backend()
+
+ if self.prefill_attention_backend is not None and (
+ self.prefill_attention_backend == self.decode_attention_backend
+ ): # override the default attention backend
+ self.attention_backend = self.prefill_attention_backend
+
+ # Pick the default attention backend if not specified
+ if self.attention_backend is None:
+ """
+ Auto select the fastest attention backend.
+
+ 1. Models with MHA Architecture (e.g: Llama, QWen)
+ 1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
+ 1.2 In other cases, we will use flashinfer if available, otherwise use triton.
+ 2. Models with MLA Architecture and using FA3
+ 2.1 We will use FA3 backend on hopper.
+ 2.2 We will use Flashinfer backend on blackwell.
+ 2.3 Otherwise, we will use triton backend.
+ """
+
+ if not use_mla_backend:
+ # MHA architecture
+ if (
+ is_hopper_with_cuda_12_3()
+ and is_no_spec_infer_or_topk_one(self)
+ and is_fa3_default_architecture(self.model_config.hf_config)
+ ):
+ self.attention_backend = "fa3"
+ elif is_hip():
+ self.attention_backend = "aiter"
+ elif is_npu():
+ self.attention_backend = "ascend"
+ else:
+ self.attention_backend = (
+ "flashinfer" if is_flashinfer_available() else "triton"
+ )
+ else:
+ # MLA architecture
+ if is_hopper_with_cuda_12_3():
+ self.attention_backend = "fa3"
+ elif is_sm100_supported():
+ self.attention_backend = "flashinfer"
+ elif is_hip():
+ head_num = model_config.get_num_kv_heads(self.tp_size)
+ # TODO current aiter only support head number 16 or 128 head number
+ if head_num == 128 or head_num == 16:
+ self.attention_backend = "aiter"
+ else:
+ self.attention_backend = "triton"
+ elif is_npu():
+ self.attention_backend = "ascend"
+ else:
+ self.attention_backend = "triton"
+
+ logger.warning(
+ f"Attention backend not explicitly specified. Use {self.attention_backend} backend by default."
+ )
+
+ # Torch native and flex attention backends
  if self.attention_backend == "torch_native":
  logger.warning(
  "Cuda graph is disabled because of using torch native attention backend"
@@ -867,12 +1121,7 @@ class ServerArgs:
  self.speculative_algorithm is None
  ), "Speculative decoding is currently not supported with Flex Attention backend"

- if is_npu() and self.attention_backend in ["ascend"]:
- logger.warning(
- "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
- )
- self.page_size = 128
-
+ # Major NVIDIA platforms backends
  if (
  self.attention_backend == "flashmla"
  or self.decode_attention_backend == "flashmla"
@@ -927,6 +1176,76 @@ class ServerArgs:
  )
  self.page_size = 64

+ if self.attention_backend == "fa3" and self.kv_cache_dtype == "fp8_e5m2":
+ logger.warning(
+ "FlashAttention3 only supports fp8_e4m3 if using FP8; "
+ "Setting attention backend to triton."
+ )
+ self.attention_backend = "triton"
+
+ if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
+ raise ValueError(
+ "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
+ )
+ if self.prefill_attention_backend == "fa4":
+ logger.warning(
+ f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
+ )
+ self.page_size = 128
+
+ # AMD platforms backends
+ if self.attention_backend == "aiter":
+ if model_config.context_len > 8192:
+ self.mem_fraction_static *= 0.90
+
+ # NPU platforms backends
+ if is_npu() and self.attention_backend in ["ascend"]:
+ logger.warning(
+ "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
+ )
+ self.page_size = 128
+
+ # Other platforms backends
+ if (
+ self.attention_backend == "intel_amx"
+ and self.device == "cpu"
+ and not cpu_has_amx_support()
+ ):
+ logger.warning(
+ "The current platform does not support Intel AMX, will fallback to torch_native backend."
+ )
+ self.attention_backend = "torch_native"
+
+ if (
+ self.attention_backend == "intel_xpu"
+ and self.device == "xpu"
+ and not xpu_has_xmx_support()
+ ):
+ logger.warning(
+ "The current platform does not support Intel XMX, will fallback to triton backend."
+ )
+ self.attention_backend = "triton"
+
+ if self.attention_backend == "intel_xpu":
+ if self.page_size not in [32, 64, 128]:
+ logger.warning(
+ f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
+ )
+ self.page_size = 128
+
+ # Dual chunk flash attention backend
+ if (
+ getattr(model_config.hf_config, "dual_chunk_attention_config", None)
+ is not None
+ ):
+ if self.attention_backend is None:
+ self.attention_backend = "dual_chunk_flash_attn"
+ logger.info("Dual chunk attention is turned on by default.")
+ elif self.attention_backend != "dual_chunk_flash_attn":
+ raise ValueError(
+ "Dual chunk attention is enabled, but attention backend is set to "
+ f"{self.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
+ )
  if self.attention_backend == "dual_chunk_flash_attn":
  logger.warning(
  "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
@@ -946,6 +1265,22 @@ class ServerArgs:
  if self.grammar_backend is None:
  self.grammar_backend = "xgrammar"

+ def _handle_ktransformers_configs(self):
+ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+ CompressedTensorsWNA16AMXEPMoEMethod,
+ override_config,
+ )
+
+ override_config(
+ CompressedTensorsWNA16AMXEPMoEMethod,
+ self.kt_num_gpu_experts,
+ self.kt_cpuinfer,
+ self.kt_threadpool_count,
+ self.kt_amx_weight_path,
+ self.kt_amx_method,
+ self.chunked_prefill_size,
+ )
+
  def _handle_data_parallelism(self):
  if self.dp_size == 1:
  self.enable_dp_attention = False
@@ -983,7 +1318,7 @@ class ServerArgs:
  "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
  )

- def _handle_deepep_moe(self):
+ def _handle_a2a_moe(self):
  if self.moe_a2a_backend == "deepep":
  if self.deepep_mode == "normal":
  logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -993,6 +1328,12 @@ class ServerArgs:
  f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
  )

+ if self.moe_a2a_backend == "mooncake":
+ self.ep_size = self.tp_size
+ logger.warning(
+ f"Mooncake MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+ )
+
  def _handle_eplb_and_dispatch(self):
  if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
  self.expert_distribution_recorder_mode = "stat"
@@ -1008,6 +1349,15 @@ class ServerArgs:
  if self.enable_eplb:
  assert self.ep_size > 1

+ def _handle_elastic_ep(self):
+ if self.elastic_ep_backend is not None:
+ if self.enable_eplb:
+ if self.eplb_algorithm == "auto":
+ self.eplb_algorithm = "elasticity_aware"
+ assert (
+ self.eplb_algorithm == "elasticity_aware"
+ ), "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware'."
+
  def _handle_expert_distribution_metrics(self):
  if self.enable_expert_distribution_metrics and (
  self.expert_distribution_recorder_mode is None
@@ -1046,6 +1396,24 @@ class ServerArgs:
  "Page first direct layout only support direct io backend"
  )

+ if self.enable_hierarchical_cache and self.hicache_io_backend == "kernel":
+ # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
+ if self.decode_attention_backend is None:
+ if not self.use_mla_backend():
+ self.decode_attention_backend = (
+ "flashinfer" if is_flashinfer_available() else "triton"
+ )
+ else:
+ self.decode_attention_backend = (
+ "flashinfer" if is_sm100_supported() else "triton"
+ )
+ elif self.decode_attention_backend == "fa3":
+ self.hicache_io_backend = "direct"
+ logger.warning(
+ "FlashAttention3 decode backend is not compatible with hierarchical cache. "
+ "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
+ )
+
  def _handle_speculative_decoding(self):
  if self.speculative_algorithm == "NEXTN":
  self.speculative_algorithm = "EAGLE"
@@ -1056,13 +1424,28 @@ class ServerArgs:
1056
1424
  raise ValueError(
1057
1425
  "Currently standalone speculative decoding does not support dp attention."
1058
1426
  )
1427
+
1059
1428
  if self.max_running_requests is None:
1060
1429
  self.max_running_requests = 48
1061
- self.disable_overlap_schedule = True
1062
- logger.warning(
1063
- "Overlap scheduler is disabled because of using "
1064
- "eagle speculative decoding."
1065
- )
1430
+ logger.warning(
1431
+ "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
1432
+ )
1433
+
1434
+ if (
1435
+ self.speculative_algorithm == "EAGLE"
1436
+ and envs.SGLANG_ENABLE_SPEC_V2.get()
1437
+ ):
1438
+ self.disable_overlap_schedule = False
1439
+ logger.warning(
1440
+ "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
1441
+ )
1442
+
1443
+ if not envs.SGLANG_ENABLE_SPEC_V2.get():
1444
+ self.disable_overlap_schedule = True
1445
+ logger.warning(
1446
+ "Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
1447
+ )
1448
+
1066
1449
  if self.enable_mixed_chunk:
1067
1450
  self.enable_mixed_chunk = False
1068
1451
  logger.warning(
@@ -1129,8 +1512,13 @@ class ServerArgs:
1129
1512
  raise ValueError(
1130
1513
  "Ngram speculative decoding only supports CUDA device."
1131
1514
  )
1515
+
1132
1516
  if self.max_running_requests is None:
1133
1517
  self.max_running_requests = 48
1518
+ logger.warning(
1519
+ "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
1520
+ )
1521
+
1134
1522
  self.disable_overlap_schedule = True
1135
1523
  self.enable_mixed_chunk = False
1136
1524
  self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
@@ -1216,6 +1604,26 @@ class ServerArgs:
1216
1604
  "Please choose one tokenizer batching approach."
1217
1605
  )
1218
1606
 
1607
+ if self.skip_tokenizer_init:
1608
+ if self.tokenizer_worker_num != 1:
1609
+ logger.warning(
1610
+ "skip_tokenizer_init=True disables tokenizer workers; forcing tokenizer_worker_num=1 "
1611
+ f"(requested {self.tokenizer_worker_num})."
1612
+ )
1613
+ self.tokenizer_worker_num = 1
1614
+
1615
+ if self.enable_tokenizer_batch_encode:
1616
+ logger.warning(
1617
+ "skip_tokenizer_init=True ignores --enable-tokenizer-batch-encode; disabling it."
1618
+ )
1619
+ self.enable_tokenizer_batch_encode = False
1620
+
1621
+ if self.enable_dynamic_batch_tokenizer:
1622
+ logger.warning(
1623
+ "skip_tokenizer_init=True ignores --enable-dynamic-batch-tokenizer; disabling it."
1624
+ )
1625
+ self.enable_dynamic_batch_tokenizer = False
1626
+
1219
1627
  def _handle_environment_variables(self):
1220
1628
  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
1221
1629
  "1" if self.enable_torch_compile else "0"
@@ -1253,21 +1661,65 @@ class ServerArgs:
1253
1661
  )
1254
1662
 
1255
1663
  def _handle_deterministic_inference(self):
1664
+ if self.rl_on_policy_target is not None:
1665
+ logger.warning(
1666
+ "Enable deterministic inference because of rl_on_policy_target."
1667
+ )
1668
+ self.enable_deterministic_inference = True
1669
+ # TODO remove this environment variable as a whole
1670
+ os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = "1"
1671
+
1256
1672
  if self.enable_deterministic_inference:
1257
1673
  # Check sampling backend
1258
1674
  self.sampling_backend = "pytorch"
1259
1675
  logger.warning(
1260
1676
  "Sampling backend is set to pytorch for deterministic inference."
1261
1677
  )
1678
+ is_deepseek_model = False
1679
+ if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
1680
+ try:
1681
+ hf_config = self.get_hf_config()
1682
+ model_arch = hf_config.architectures[0]
1683
+ is_deepseek_model = model_arch in [
1684
+ "DeepseekV2ForCausalLM",
1685
+ "DeepseekV3ForCausalLM",
1686
+ "DeepseekV32ForCausalLM",
1687
+ ]
1688
+ except Exception:
1689
+ pass
1262
1690
 
1263
1691
  # Check attention backend
1264
- if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
1692
+ if self.attention_backend is None:
1693
+ # User didn't specify attention backend, fallback based on GPU architecture
1694
+ if is_sm100_supported() or is_sm120_supported():
1695
+ # Blackwell and newer architectures
1696
+ if is_deepseek_model:
1697
+ # fallback to triton for DeepSeek models because flashinfer doesn't support deterministic inference for DeepSeek models yet
1698
+ self.attention_backend = "triton"
1699
+ else:
1700
+ # fallback to flashinfer on Blackwell for non-DeepSeek models
1701
+ self.attention_backend = "flashinfer"
1702
+ else:
1703
+ # Hopper (SM90) and older architectures
1704
+ self.attention_backend = "fa3"
1705
+ logger.warning(
1706
+ f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
1707
+ f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
1708
+ )
1709
+ elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
1710
+ # User explicitly specified an incompatible attention backend
1265
1711
  raise ValueError(
1266
- f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
1712
+ f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
1713
+ f"but you explicitly specified '{self.attention_backend}'."
1267
1714
  )
1268
1715
 
1269
- # Currently, only FA3 supports radix cache. Support for other backends is in progress
1270
- if self.attention_backend != "fa3":
1716
+ if self.attention_backend not in ["fa3", "triton"]:
1717
+ if is_deepseek_model:
1718
+ raise ValueError(
1719
+ f"Currently only fa3 and triton attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
1720
+ )
1721
+
1722
+ # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
1271
1723
  self.disable_radix_cache = True
1272
1724
  logger.warning(
1273
1725
  f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
@@ -1286,6 +1738,7 @@ class ServerArgs:

  @staticmethod
  def add_cli_args(parser: argparse.ArgumentParser):
+
  # Model and tokenizer
  parser.add_argument(
  "--model-path",
@@ -1405,6 +1858,11 @@ class ServerArgs:
  default=ServerArgs.port,
  help="The port of the HTTP server.",
  )
+ parser.add_argument(
+ "--grpc-mode",
+ action="store_true",
+ help="If set, use gRPC server instead of HTTP server.",
+ )
  parser.add_argument(
  "--skip-server-warmup",
  action="store_true",
@@ -1423,6 +1881,12 @@ class ServerArgs:
  default=ServerArgs.nccl_port,
  help="The port for NCCL distributed environment setup. Defaults to a random port.",
  )
+ parser.add_argument(
+ "--checkpoint-engine-wait-weights-before-ready",
+ action="store_true",
+ help="If set, the server will wait for initial weights to be loaded via checkpoint-engine or other update methods "
+ "before serving inference requests.",
+ )

  # Quantization and data type
  parser.add_argument(
@@ -1459,14 +1923,53 @@ class ServerArgs:
  "--kv-cache-dtype",
  type=str,
  default=ServerArgs.kv_cache_dtype,
- choices=["auto", "fp8_e5m2", "fp8_e4m3"],
- help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
+ choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
+ help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
  )
  parser.add_argument(
  "--enable-fp32-lm-head",
  action="store_true",
  help="If set, the LM head outputs (logits) are in FP32.",
  )
+ parser.add_argument(
+ "--modelopt-quant",
+ type=str,
+ default=ServerArgs.modelopt_quant,
+ help="The ModelOpt quantization configuration. "
+ "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
+ "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
+ )
+ parser.add_argument(
+ "--modelopt-checkpoint-restore-path",
+ type=str,
+ default=ServerArgs.modelopt_checkpoint_restore_path,
+ help="Path to restore a previously saved ModelOpt quantized checkpoint. "
+ "If provided, the quantization process will be skipped and the model "
+ "will be loaded from this checkpoint.",
+ )
+ parser.add_argument(
+ "--modelopt-checkpoint-save-path",
+ type=str,
+ default=ServerArgs.modelopt_checkpoint_save_path,
+ help="Path to save the ModelOpt quantized checkpoint after quantization. "
+ "This allows reusing the quantized model in future runs.",
+ )
+ parser.add_argument(
+ "--modelopt-export-path",
+ type=str,
+ default=ServerArgs.modelopt_export_path,
+ help="Path to export the quantized model in HuggingFace format after ModelOpt quantization. "
+ "The exported model can then be used directly with SGLang for inference. "
+ "If not provided, the model will not be exported.",
+ )
+ parser.add_argument(
+ "--quantize-and-serve",
+ action="store_true",
+ default=ServerArgs.quantize_and_serve,
+ help="Quantize the model with ModelOpt and immediately serve it without exporting. "
+ "This is useful for development and prototyping. For production, it's recommended "
+ "to use separate quantization and deployment steps.",
+ )
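Read together, the new --modelopt-* flags describe a quantize-once, reuse-later workflow: quantize (and optionally save a checkpoint) on the first run, then restore or export it later. A hedged usage sketch follows; it assumes this file is importable as sglang.srt.server_args, and the model name and paths are placeholders rather than values taken from this diff.

    # Hypothetical first run: FP8-quantize with ModelOpt, save a checkpoint, and serve immediately.
    from sglang.srt.server_args import prepare_server_args

    argv = [
        "--model-path", "some-org/some-model",  # placeholder model
        "--modelopt-quant", "fp8",
        "--modelopt-checkpoint-save-path", "/tmp/modelopt-fp8-ckpt",  # placeholder path
        "--quantize-and-serve",
    ]
    server_args = prepare_server_args(argv)
    # A later run could skip requantization with
    #   --modelopt-checkpoint-restore-path /tmp/modelopt-fp8-ckpt
    # or produce a HuggingFace-format export with --modelopt-export-path.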

  # Memory and scheduling
  parser.add_argument(
@@ -1519,6 +2022,12 @@ class ServerArgs:
  default=ServerArgs.enable_priority_scheduling,
  help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
  )
+ parser.add_argument(
+ "--abort-on-priority-when-disabled",
+ action="store_true",
+ default=ServerArgs.abort_on_priority_when_disabled,
+ help="If set, abort requests that specify a priority when priority scheduling is disabled.",
+ )
  parser.add_argument(
  "--schedule-low-priority-values-first",
  action="store_true",
@@ -1565,7 +2074,14 @@ class ServerArgs:
  parser.add_argument(
  "--disable-hybrid-swa-memory",
  action="store_true",
- help="Disable the hybrid SWA memory.",
+ help="Disable the hybrid SWA memory pool.",
+ )
+ parser.add_argument(
+ "--radix-eviction-policy",
+ type=str,
+ choices=RADIX_EVICTION_POLICY_CHOICES,
+ default=ServerArgs.radix_eviction_policy,
+ help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
  )

  # Runtime options
@@ -1590,9 +2106,9 @@ class ServerArgs:
  help="The pipeline parallelism size.",
  )
  parser.add_argument(
- "--max-micro-batch-size",
+ "--pp-max-micro-batch-size",
  type=int,
- default=ServerArgs.max_micro_batch_size,
+ default=ServerArgs.pp_max_micro_batch_size,
  help="The maximum micro batch size in pipeline parallelism.",
  )
  parser.add_argument(
@@ -1616,7 +2132,12 @@ class ServerArgs:
  "--constrained-json-whitespace-pattern",
  type=str,
  default=ServerArgs.constrained_json_whitespace_pattern,
- help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+ help="(outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+ )
+ parser.add_argument(
+ "--constrained-json-disable-any-whitespace",
+ action="store_true",
+ help="(xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output.",
  )
  parser.add_argument(
  "--watchdog-timeout",
@@ -1863,6 +2384,16 @@ class ServerArgs:
  default=None,
  help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
  )
+ parser.add_argument(
+ "--sampling-defaults",
+ type=str,
+ choices=["openai", "model"],
+ default=ServerArgs.sampling_defaults,
+ help="Where to get default sampling parameters. "
+ "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
+ "'model' uses the model's generation_config.json to get the recommended "
+ "sampling parameters if available. Default is 'model'.",
+ )

  # Data parallelism
  parser.add_argument(
@@ -1966,6 +2497,13 @@ class ServerArgs:
  default=ServerArgs.max_loaded_loras,
  help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
  )
+ parser.add_argument(
+ "--lora-eviction-policy",
+ type=str,
+ default=ServerArgs.lora_eviction_policy,
+ choices=["lru", "fifo"],
+ help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
+ )
  parser.add_argument(
  "--lora-backend",
  type=str,
@@ -2025,14 +2563,14 @@ class ServerArgs:
  help="Set multimodal attention backend.",
  )
  parser.add_argument(
- "--nsa-prefill",
- default=ServerArgs.nsa_prefill,
+ "--nsa-prefill-backend",
+ default=ServerArgs.nsa_prefill_backend,
  type=str,
  choices=NSA_CHOICES,
  )
  parser.add_argument(
- "--nsa-decode",
- default=ServerArgs.nsa_decode,
+ "--nsa-decode-backend",
+ default=ServerArgs.nsa_decode_backend,
  type=str,
  choices=NSA_CHOICES,
  )
@@ -2058,6 +2596,15 @@ class ServerArgs:
  "name, a tag name, or a commit id. If unspecified, will use "
  "the default version.",
  )
+ parser.add_argument(
+ "--speculative-draft-load-format",
+ type=str,
+ default=ServerArgs.speculative_draft_load_format,
+ choices=LOAD_FORMAT_CHOICES,
+ help="The format of the draft model weights to load. "
+ "If not specified, will use the same format as --load-format. "
+ "Use 'dummy' to initialize draft model weights with random values for profiling.",
+ )
  parser.add_argument(
  "--speculative-num-steps",
  type=int,
@@ -2158,22 +2705,14 @@ class ServerArgs:
  parser.add_argument(
  "--moe-a2a-backend",
  type=str,
- choices=["none", "deepep"],
+ choices=["none", "deepep", "mooncake"],
  default=ServerArgs.moe_a2a_backend,
  help="Choose the backend for MoE A2A.",
  )
  parser.add_argument(
  "--moe-runner-backend",
  type=str,
- choices=[
- "auto",
- "triton",
- "triton_kernel",
- "flashinfer_trtllm",
- "flashinfer_cutlass",
- "flashinfer_mxfp4",
- "flashinfer_cutedsl",
- ],
+ choices=MOE_RUNNER_BACKEND_CHOICES,
  default=ServerArgs.moe_runner_backend,
  help="Choose the runner backend for MoE.",
  )
@@ -2272,6 +2811,21 @@ class ServerArgs:
  default=ServerArgs.moe_dense_tp_size,
  help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
  )
+ parser.add_argument(
+ "--elastic-ep-backend",
+ type=str,
+ default=ServerArgs.elastic_ep_backend,
+ choices=["none", "mooncake"],
+ help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
+ )
+ parser.add_argument(
+ "--mooncake-ib-device",
+ type=str,
+ default=ServerArgs.mooncake_ib_device,
+ help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
+ "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
+ "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
+ )

  # Mamba Cache
  parser.add_argument(
@@ -2287,6 +2841,12 @@ class ServerArgs:
  choices=["float32", "bfloat16"],
  help="The data type of the SSM states in mamba cache.",
  )
+ parser.add_argument(
+ "--mamba-full-memory-ratio",
+ type=float,
+ default=ServerArgs.mamba_full_memory_ratio,
+ help="The ratio of mamba state memory to full kv cache memory.",
+ )

  # Hierarchical cache
  parser.add_argument(
@@ -2313,13 +2873,6 @@ class ServerArgs:
  default=ServerArgs.hicache_write_policy,
  help="The write policy of hierarchical cache.",
  )
- parser.add_argument(
- "--radix-eviction-policy",
- type=str,
- choices=RADIX_EVICTION_POLICY_CHOICES,
- default=ServerArgs.radix_eviction_policy,
- help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
- )
  parser.add_argument(
  "--hicache-io-backend",
  type=str,
@@ -2364,6 +2917,35 @@ class ServerArgs:
  help="Using LMCache as an alternative hierarchical cache solution",
  )

+ # Ktransformer server args
+ parser.add_argument(
+ "--kt-amx-weight-path",
+ type=str,
+ help="[ktransformers parameter] The path of the quantized expert weights for amx kernel. A local folder.",
+ )
+ parser.add_argument(
+ "--kt-amx-method",
+ type=str,
+ default="AMXINT4",
+ help="[ktransformers parameter] Quantization formats for CPU execution.",
+ )
+ parser.add_argument(
+ "--kt-cpuinfer",
+ type=int,
+ help="[ktransformers parameter] The number of CPUInfer threads.",
+ )
+ parser.add_argument(
+ "--kt-threadpool-count",
+ type=int,
+ default=2,
+ help="[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA).",
+ )
+ parser.add_argument(
+ "--kt-num-gpu-experts",
+ type=int,
+ help="[ktransformers parameter] The number of GPU experts.",
+ )
+
  # Double Sparsity
  parser.add_argument(
  "--enable-double-sparsity",
@@ -2398,7 +2980,7 @@ class ServerArgs:
  "--ds-sparse-decode-threshold",
  type=int,
  default=ServerArgs.ds_sparse_decode_threshold,
- help="The type of heavy channels in double sparsity attention",
+ help="The minimum decode sequence length required before the double-sparsity backend switches from the dense fallback to the sparse decode kernel.",
  )

  # Offloading
@@ -2433,6 +3015,14 @@ class ServerArgs:
  help="Mode of offloading.",
  )

+ # Args for multi-item-scoring
+ parser.add_argument(
+ "--multi-item-scoring-delimiter",
+ type=int,
+ default=ServerArgs.multi_item_scoring_delimiter,
+ help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
+ )
+
  # Optimization/debug options
  parser.add_argument(
  "--disable-radix-cache",
@@ -2491,6 +3081,11 @@ class ServerArgs:
  action="store_true",
  help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
  )
+ parser.add_argument(
+ "--disable-tokenizer-batch-decode",
+ action="store_true",
+ help="Disable batch decoding when decoding multiple completions.",
+ )
  parser.add_argument(
  "--disable-outlines-disk-cache",
  action="store_true",
@@ -2552,12 +3147,36 @@ class ServerArgs:
  action="store_true",
  help="Optimize the model with torch.compile. Experimental feature.",
  )
+ parser.add_argument(
+ "--enable-piecewise-cuda-graph",
+ action="store_true",
+ help="Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature.",
+ )
+ parser.add_argument(
+ "--piecewise-cuda-graph-tokens",
+ type=json_list_type,
+ default=ServerArgs.piecewise_cuda_graph_tokens,
+ help="Set the list of tokens when using piecewise cuda graph.",
+ )
+ parser.add_argument(
+ "--piecewise-cuda-graph-compiler",
+ type=str,
+ default=ServerArgs.piecewise_cuda_graph_compiler,
+ help="Set the compiler for piecewise cuda graph. Choices are: eager, inductor.",
+ choices=["eager", "inductor"],
+ )
  parser.add_argument(
  "--torch-compile-max-bs",
  type=int,
  default=ServerArgs.torch_compile_max_bs,
  help="Set the maximum batch size when using torch compile.",
  )
+ parser.add_argument(
+ "--piecewise-cuda-graph-max-tokens",
+ type=int,
+ default=ServerArgs.piecewise_cuda_graph_max_tokens,
+ help="Set the maximum tokens when using piecewise cuda graph.",
+ )
  parser.add_argument(
  "--torchao-config",
  type=str,
@@ -2667,31 +3286,20 @@ class ServerArgs:
  nargs="+",
  help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
  )
-
- # Debug tensor dumps
  parser.add_argument(
- "--debug-tensor-dump-output-folder",
- type=str,
- default=ServerArgs.debug_tensor_dump_output_folder,
- help="The output folder for dumping tensors.",
- )
- parser.add_argument(
- "--debug-tensor-dump-input-file",
- type=str,
- default=ServerArgs.debug_tensor_dump_input_file,
- help="The input filename for dumping tensors",
+ "--enable-deterministic-inference",
+ action="store_true",
+ help="Enable deterministic inference mode with batch invariant ops.",
  )
  parser.add_argument(
- "--debug-tensor-dump-inject",
+ "--rl-on-policy-target",
  type=str,
- default=ServerArgs.debug_tensor_dump_inject,
- help="Inject the outputs from jax as the input of every layer.",
- )
- parser.add_argument(
- "--debug-tensor-dump-prefill-only",
- action="store_true",
- help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+ default=ServerArgs.rl_on_policy_target,
+ choices=["fsdp"],
+ help="The training system that SGLang needs to match for true on-policy.",
  )
+
+ # Dynamic batch tokenizer
  parser.add_argument(
  "--enable-dynamic-batch-tokenizer",
  action="store_true",
@@ -2710,6 +3318,26 @@ class ServerArgs:
  help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
  )

+ # Debug tensor dumps
+ parser.add_argument(
+ "--debug-tensor-dump-output-folder",
+ type=str,
+ default=ServerArgs.debug_tensor_dump_output_folder,
+ help="The output folder for dumping tensors.",
+ )
+ parser.add_argument(
+ "--debug-tensor-dump-input-file",
+ type=str,
+ default=ServerArgs.debug_tensor_dump_input_file,
+ help="The input filename for dumping tensors",
+ )
+ parser.add_argument(
+ "--debug-tensor-dump-inject",
+ type=str,
+ default=ServerArgs.debug_tensor_dump_inject,
+ help="Inject the outputs from jax as the input of every layer.",
+ )
+
  # PD disaggregation
  parser.add_argument(
  "--disaggregation-mode",
@@ -2813,7 +3441,12 @@ class ServerArgs:
  action="store_true",
  help="Enable PD-Multiplexing, PD running on greenctx stream.",
  )
-
+ parser.add_argument(
+ "--pdmux-config-path",
+ type=str,
+ default=None,
+ help="The path of the PD-Multiplexing config file.",
+ )
  parser.add_argument(
  "--sm-group-num",
  type=int,
@@ -2821,50 +3454,6 @@ class ServerArgs:
  help="Number of sm partition groups.",
  )

- # For deterministic inference
- parser.add_argument(
- "--enable-deterministic-inference",
- action="store_true",
- help="Enable deterministic inference mode with batch invariant ops.",
- )
-
- # Deprecated arguments
- parser.add_argument(
- "--enable-ep-moe",
- action=DeprecatedAction,
- help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
- )
- parser.add_argument(
- "--enable-deepep-moe",
- action=DeprecatedAction,
- help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
- )
- parser.add_argument(
- "--enable-flashinfer-cutlass-moe",
- action=DeprecatedAction,
- help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
- )
- parser.add_argument(
- "--enable-flashinfer-cutedsl-moe",
- action=DeprecatedAction,
- help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
- )
- parser.add_argument(
- "--enable-flashinfer-trtllm-moe",
- action=DeprecatedAction,
- help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
- )
- parser.add_argument(
- "--enable-triton-kernel-moe",
- action=DeprecatedAction,
- help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
- )
- parser.add_argument(
- "--enable-flashinfer-mxfp4-moe",
- action=DeprecatedAction,
- help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
- )
-
  # Configuration file support
  parser.add_argument(
  "--config",
@@ -2894,11 +3483,39 @@ class ServerArgs:
  self.model_path,
  trust_remote_code=self.trust_remote_code,
  revision=self.revision,
- model_override_args=json.loads(self.json_model_override_args),
+ model_override_args=orjson.loads(self.json_model_override_args),
  **kwargs,
  )
  return hf_config

+ def get_model_config(self):
+ # Lazy init to avoid circular import
+ from sglang.srt.configs.model_config import ModelConfig
+
+ if hasattr(self, "model_config"):
+ return self.model_config
+ self.model_config = ModelConfig.from_server_args(self)
+ return self.model_config
+
+ def get_attention_backends(self):
+ prefill_attention_backend_str = (
+ self.prefill_attention_backend
+ if self.prefill_attention_backend
+ else self.attention_backend
+ )
+ decode_attention_backend_str = (
+ self.decode_attention_backend
+ if self.decode_attention_backend
+ else self.attention_backend
+ )
+ return prefill_attention_backend_str, decode_attention_backend_str
+
+ def use_mla_backend(self):
+ from sglang.srt.configs.model_config import AttentionArch
+
+ model_config = self.get_model_config()
+ return model_config.attention_arch == AttentionArch.MLA
+
  def check_server_args(self):
  # Check parallel size constraints
  assert (
@@ -2941,7 +3558,34 @@ class ServerArgs:
  self.chunked_prefill_size % self.page_size == 0
  ), "chunked_prefill_size must be divisible by page_size"

- # Check multi tokenizer
+ # Check pdmux
+ if self.enable_pdmux:
+ assert (
+ self.pp_size == 1
+ ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
+ assert (
+ self.chunked_prefill_size == -1
+ ), "PD-Multiplexing is not compatible with chunked prefill."
+ assert (
+ self.disaggregation_mode == "null"
+ ), "PD-Multiplexing is not compatible with disaggregation mode."
+ assert (
+ self.disable_overlap_schedule
+ ), "PD-Multiplexing is not compatible with overlap schedule."
+
+ # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
+ import torch
+
+ parts = torch.__version__.split("+", 1)[0].split(".")
+ major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
+ minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
+ if (major, minor) > (2, 6):
+ logger.warning(
+ "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
+ f" Current torch version is {torch.__version__}.\n"
+ " Please manually install torch 2.6.x."
+ )
+
  assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
  self.validate_buckets_rule(
  "--prompt-tokens-buckets", self.prompt_tokens_buckets
@@ -2957,6 +3601,17 @@ class ServerArgs:
  "lof",
  ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."

+ # Check multi-item scoring
+ if self.multi_item_scoring_delimiter is not None:
+ assert self.disable_radix_cache, (
+ "Multi-item scoring requires radix cache to be disabled. "
+ "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
+ )
+ assert self.chunked_prefill_size == -1, (
+ "Multi-item scoring requires chunked prefill to be disabled. "
+ "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
+ )
+
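The assertions above pair with the --multi-item-scoring-delimiter flag added earlier in this diff. The packed layout its help text describes (Query<delimiter>Item1<delimiter>Item2<delimiter>...) can be sketched as follows; the function name and inputs are illustrative and not part of the package.

    from typing import List

    def pack_multi_item_sequence(
        query_ids: List[int], item_ids: List[List[int]], delimiter_id: int
    ) -> List[int]:
        # Query<delimiter>Item1<delimiter>Item2<delimiter>...
        packed = list(query_ids)
        for item in item_ids:
            packed.append(delimiter_id)
            packed.extend(item)
        packed.append(delimiter_id)
        return packed

    # pack_multi_item_sequence([1, 2, 3], [[10, 11], [20]], delimiter_id=7)
    # -> [1, 2, 3, 7, 10, 11, 7, 20, 7]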
  def check_lora_server_args(self):
  assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

@@ -3141,6 +3796,22 @@ class ServerArgs:
  )


+ # NOTE: This is a global variable to hold the server args for scheduler.
+ _global_server_args: Optional[ServerArgs] = None
+
+
+ def set_global_server_args_for_scheduler(server_args: ServerArgs):
+ global _global_server_args
+ _global_server_args = server_args
+
+
+ def get_global_server_args() -> ServerArgs:
+ if _global_server_args is None:
+ raise ValueError("Global server args is not set yet!")
+
+ return _global_server_args
+
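A short usage sketch for the new module-level accessor pair; the scheduler-side call sites shown here are assumptions for illustration, and the module path is taken to be sglang.srt.server_args.

    from sglang.srt.server_args import (
        ServerArgs,
        get_global_server_args,
        set_global_server_args_for_scheduler,
    )

    def scheduler_process_entry(server_args: ServerArgs) -> None:
        # Must run once in the scheduler process before any lookup below;
        # otherwise get_global_server_args() raises ValueError.
        set_global_server_args_for_scheduler(server_args)

    def some_scheduler_helper() -> int:
        return get_global_server_args().tp_size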
+
  def prepare_server_args(argv: List[str]) -> ServerArgs:
  """
  Prepare the server arguments from the command line arguments.
@@ -3175,11 +3846,12 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
  parser = argparse.ArgumentParser()
  ServerArgs.add_cli_args(parser)
  raw_args = parser.parse_args(argv)
- server_args = ServerArgs.from_cli_args(raw_args)
- return server_args
+
+ return ServerArgs.from_cli_args(raw_args)


  ZMQ_TCP_PORT_DELTA = 233
+ DP_ATTENTION_HANDSHAKE_PORT_DELTA = 5


  @dataclasses.dataclass
@@ -3204,7 +3876,11 @@ class PortArgs:
  tokenizer_worker_ipc_name: Optional[str]

  @staticmethod
- def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
+ def init_new(
+ server_args: ServerArgs,
+ dp_rank: Optional[int] = None,
+ worker_ports: Optional[List[int]] = None,
+ ) -> PortArgs:
  if server_args.nccl_port is None:
  nccl_port = server_args.port + random.randint(100, 1000)
  while True:
@@ -3217,6 +3893,13 @@ class PortArgs:
  else:
  nccl_port = server_args.nccl_port

+ if server_args.tokenizer_worker_num > 1:
+ tokenizer_worker_ipc_name = (
+ f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+ )
+ else:
+ tokenizer_worker_ipc_name = None
+
  if not server_args.enable_dp_attention:
  # Normal case, use IPC within a single node
  return PortArgs(
@@ -3226,7 +3909,7 @@ class PortArgs:
  nccl_port=nccl_port,
  rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
  metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
- tokenizer_worker_ipc_name=None,
+ tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
  )
  else:
  # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -3251,8 +3934,8 @@ class PortArgs:
  # TokenizerManager to DataParallelController
  scheduler_input_port = port_base + 4
  else:
- scheduler_input_port = port_base + 4 + 1 + dp_rank
-
+ assert worker_ports is not None
+ scheduler_input_port = worker_ports[dp_rank]
  return PortArgs(
  tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
  scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
@@ -3260,7 +3943,7 @@ class PortArgs:
  nccl_port=nccl_port,
  rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
  metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
- tokenizer_worker_ipc_name=None,
+ tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
  )

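To close, a hedged sketch of how the widened PortArgs.init_new signature might be called on the DP-attention path, where each data-parallel rank now takes its scheduler input port from a caller-provided list instead of computing port_base + 4 + 1 + dp_rank. The wrapper and port list below are placeholders, not code from this diff.

    from typing import List

    from sglang.srt.server_args import PortArgs, ServerArgs

    def port_args_for_dp_rank(
        server_args: ServerArgs, dp_rank: int, worker_ports: List[int]
    ) -> PortArgs:
        # When enable_dp_attention is set and a dp_rank is given, init_new asserts
        # that worker_ports is provided and indexes it with dp_rank.
        return PortArgs.init_new(server_args, dp_rank=dp_rank, worker_ports=worker_ports)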