sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -13,6 +13,8 @@
 # ==============================================================================
 """The arguments of the server."""
 
+from __future__ import annotations
+
 import argparse
 import dataclasses
 import json
@@ -20,7 +22,9 @@ import logging
 import os
 import random
 import tempfile
-from typing import List, Literal, Optional, Union
+from typing import Dict, List, Literal, Optional, Union
+
+import orjson
 
 from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
@@ -32,6 +36,7 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    get_device_sm,
     is_cuda,
     is_flashinfer_available,
     is_hip,
@@ -40,6 +45,7 @@ from sglang.srt.utils import (
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
+    is_sm120_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     json_list_type,
@@ -51,6 +57,7 @@ from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
+
 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
@@ -76,6 +83,7 @@ QUANTIZATION_CHOICES = [
     "bitsandbytes",
     "gguf",
     "modelopt",
+    "modelopt_fp8",
     "modelopt_fp4",
     "petit_nvfp4",
     "w8a8_int8",
@@ -84,6 +92,7 @@ QUANTIZATION_CHOICES = [
     "qoq",
     "w4afp8",
     "mxfp4",
+    "compressed-tensors",  # for Ktransformers
 ]
 
 ATTENTION_BACKEND_CHOICES = [
@@ -107,6 +116,7 @@ ATTENTION_BACKEND_CHOICES = [
     # Other platforms
     "intel_amx",
     "ascend",
+    "intel_xpu",
 ]
 
 LORA_BACKEND_CHOICES = ["triton", "csgmv"]
@@ -117,10 +127,24 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
 
 DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
 
-NSA_CHOICES = ["flashmla_prefill", "flashmla_decode", "fa3", "tilelang", "aiter"]
+DEFAULT_LORA_EVICTION_POLICY = "lru"
+
+NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
 
 RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
 
+MOE_RUNNER_BACKEND_CHOICES = [
+    "auto",
+    "deep_gemm",
+    "triton",
+    "triton_kernel",
+    "flashinfer_trtllm",
+    "flashinfer_cutlass",
+    "flashinfer_mxfp4",
+    "flashinfer_cutedsl",
+    "cutlass",
+]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
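The new MOE_RUNNER_BACKEND_CHOICES list joins the other extensible choice lists, and the next hunk adds a matching add_moe_runner_backend_choices() hook. A minimal sketch of how plugin code could use that hook before CLI parsing (the backend name here is made up for illustration, not part of this release):

# Hypothetical plugin-side usage of the new registration hook.
from sglang.srt import server_args

server_args.add_moe_runner_backend_choices(["my_moe_backend"])
assert "my_moe_backend" in server_args.MOE_RUNNER_BACKEND_CHOICES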
@@ -143,6 +167,10 @@ def add_grammar_backend_choices(choices):
     GRAMMAR_BACKEND_CHOICES.extend(choices)
 
 
+def add_moe_runner_backend_choices(choices):
+    MOE_RUNNER_BACKEND_CHOICES.extend(choices)
+
+
 def add_deterministic_attention_backend_choices(choices):
     DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
 
@@ -162,6 +190,11 @@ class ServerArgs:
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
     trust_remote_code: bool = False
+    modelopt_quant: Optional[Union[str, Dict]] = None
+    modelopt_checkpoint_restore_path: Optional[str] = None
+    modelopt_checkpoint_save_path: Optional[str] = None
+    modelopt_export_path: Optional[str] = None
+    quantize_and_serve: bool = False
     context_length: Optional[int] = None
     is_embedding: bool = False
     enable_multimodal: Optional[bool] = None
@@ -171,9 +204,11 @@ class ServerArgs:
     # HTTP server
     host: str = "127.0.0.1"
     port: int = 30000
+    grpc_mode: bool = False
     skip_server_warmup: bool = False
     warmups: Optional[str] = None
     nccl_port: Optional[int] = None
+    checkpoint_engine_wait_weights_before_ready: bool = False
 
     # Quantization and data type
     dtype: str = "auto"
@@ -191,6 +226,7 @@ class ServerArgs:
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
     enable_priority_scheduling: bool = False
+    abort_on_priority_when_disabled: bool = False
     schedule_low_priority_values_first: bool = False
     priority_scheduling_preemption_threshold: int = 10
     schedule_conservativeness: float = 1.0
@@ -202,13 +238,16 @@ class ServerArgs:
 
     # Runtime options
     device: Optional[str] = None
+    elastic_ep_backend: Literal[None, "mooncake"] = None
+    mooncake_ib_device: Optional[str] = None
     tp_size: int = 1
     pp_size: int = 1
-    max_micro_batch_size: Optional[int] = None
+    pp_max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
+    constrained_json_disable_any_whitespace: bool = False
     watchdog_timeout: float = 300
     dist_timeout: Optional[int] = None  # timeout for torch.distributed
     download_dir: Optional[str] = None
@@ -251,6 +290,7 @@ class ServerArgs:
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
     tool_server: Optional[str] = None
+    sampling_defaults: str = "model"
 
     # Data parallelism
     dp_size: int = 1
@@ -277,6 +317,7 @@ class ServerArgs:
     ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
+    lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
     lora_backend: str = "triton"
     max_lora_chunk_size: Optional[int] = 16
 
@@ -287,13 +328,15 @@ class ServerArgs:
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
-    nsa_prefill: str = "flashmla_prefill"
-    nsa_decode: str = "fa3"
+    nsa_prefill_backend: str = "flashmla_sparse"
+    nsa_decode_backend: str = "fa3"
 
     # Speculative decoding
+    enable_beta_spec: bool = False
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
     speculative_draft_model_revision: Optional[str] = None
+    speculative_draft_load_format: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
@@ -312,15 +355,8 @@ class ServerArgs:
 
     # Expert parallelism
     ep_size: int = 1
-    moe_a2a_backend: Literal["none", "deepep"] = "none"
-    moe_runner_backend: Literal[
-        "auto",
-        "triton",
-        "triton_kernel",
-        "flashinfer_trtllm",
-        "flashinfer_cutlass",
-        "flashinfer_mxfp4",
-    ] = "auto"
+    moe_a2a_backend: Literal["none", "deepep", "mooncake"] = "none"
+    moe_runner_backend: str = "auto"
     flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
@@ -343,6 +379,7 @@ class ServerArgs:
     # Mamba cache
     max_mamba_cache_size: Optional[int] = None
    mamba_ssm_dtype: str = "float32"
+    mamba_full_memory_ratio: float = 0.9
 
     # Hierarchical cache
     enable_hierarchical_cache: bool = False
@@ -357,6 +394,13 @@ class ServerArgs:
     # LMCache
     enable_lmcache: bool = False
 
+    # Ktransformers
+    kt_amx_weight_path: Optional[str] = None
+    kt_amx_method: Optional[str] = None
+    kt_cpuinfer: Optional[int] = None
+    kt_threadpool_count: Optional[int] = None
+    kt_num_gpu_experts: Optional[int] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -372,6 +416,12 @@ class ServerArgs:
     offload_prefetch_step: int = 1
     offload_mode: str = "cpu"
 
+    # Scoring configuration
+    # Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
+    # Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
+    # This enables efficient batch processing of multiple items against a single query.
+    multi_item_scoring_delimiter: Optional[Union[int]] = None
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
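The comment on multi_item_scoring_delimiter describes a flat token layout. The sketch below only restates that layout with made-up token IDs; the delimiter value and the placement of delimiters between (not after) items are illustrative assumptions, not taken from this diff.

# Illustrative only: builds the documented Query<delimiter>Item1<delimiter>Item2... sequence.
delimiter = 100            # hypothetical value passed via --multi-item-scoring-delimiter
query_ids = [11, 12, 13]   # hypothetical pre-tokenized query
item_ids = [[21, 22], [31], [41, 42, 43]]  # hypothetical pre-tokenized items

combined = list(query_ids)
for ids in item_ids:
    combined.append(delimiter)   # one delimiter token before each item
    combined.extend(ids)
print(combined)  # [11, 12, 13, 100, 21, 22, 100, 31, 100, 41, 42, 43]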
@@ -384,6 +434,7 @@ class ServerArgs:
     enable_symm_mem: bool = False
     disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
     enable_tokenizer_batch_encode: bool = False
+    disable_tokenizer_batch_decode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
@@ -396,7 +447,11 @@ class ServerArgs:
     enable_single_batch_overlap: bool = False
     tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
+    enable_piecewise_cuda_graph: bool = False
     torch_compile_max_bs: int = 32
+    piecewise_cuda_graph_max_tokens: int = 4096
+    piecewise_cuda_graph_tokens: Optional[List[int]] = None
+    piecewise_cuda_graph_compiler: str = "eager"
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -428,7 +483,6 @@ class ServerArgs:
     debug_tensor_dump_output_folder: Optional[str] = None
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
-    debug_tensor_dump_prefill_only: bool = False
 
     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
@@ -452,12 +506,31 @@ class ServerArgs:
 
     # For PD-Multiplexing
     enable_pdmux: bool = False
-    sm_group_num: int = 3
+    pdmux_config_path: Optional[str] = None
+    sm_group_num: int = 8
+
+    def get_attention_backends(server_args):
+        prefill_attention_backend_str = (
+            server_args.prefill_attention_backend
+            if server_args.prefill_attention_backend
+            else server_args.attention_backend
+        )
+        decode_attention_backend_str = (
+            server_args.decode_attention_backend
+            if server_args.decode_attention_backend
+            else server_args.attention_backend
+        )
+        return prefill_attention_backend_str, decode_attention_backend_str
 
     def __post_init__(self):
         """
         Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
         """
+
+        if self.model_path.lower() in ["none", "dummy"]:
+            # Skip for dummy models
+            return
+
         # Handle deprecated arguments.
         self._handle_deprecated_args()
 
@@ -484,12 +557,15 @@ class ServerArgs:
         self._handle_amd_specifics()
         self._handle_grammar_backend()
 
+        # Handle Ktransformers specific configs
+        self._handle_ktransformers_configs()
+
         # Handle data parallelism.
         self._handle_data_parallelism()
 
         # Handle MoE configurations.
         self._handle_moe_kernel_config()
-        self._handle_deepep_moe()
+        self._handle_a2a_moe()
         self._handle_eplb_and_dispatch()
         self._handle_expert_distribution_metrics()
 
@@ -526,8 +602,33 @@ class ServerArgs:
         # Handle any other necessary validations.
         self._handle_other_validations()
 
+        # Handle elastic expert parallelism.
+        self._handle_elastic_ep()
+
     def _handle_deprecated_args(self):
-        pass
+        # handle deprecated tool call parsers
+        deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
+        if self.tool_call_parser in deprecated_tool_call_parsers:
+            logger.warning(
+                f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
+            )
+            self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
+
+    def _handle_ktransformers_configs(self):
+        from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            override_config,
+        )
+
+        override_config(
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            self.kt_num_gpu_experts,
+            self.kt_cpuinfer,
+            self.kt_threadpool_count,
+            self.kt_amx_weight_path,
+            self.kt_amx_method,
+            self.chunked_prefill_size,
+        )
 
     def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
@@ -571,6 +672,16 @@ class ServerArgs:
                 self.chunked_prefill_size = 2048
             if self.cuda_graph_max_bs is None:
                 self.cuda_graph_max_bs = 8
+        elif is_npu() and gpu_mem < 32 * 1024:
+            # Atlas A2B4
+            # (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 32768
+            if self.cuda_graph_max_bs is None:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 16
+                else:
+                    self.cuda_graph_max_bs = 64
         elif gpu_mem < 35 * 1024:
             # A10, 4090, 5090
             # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
@@ -594,6 +705,16 @@ class ServerArgs:
                    self.cuda_graph_max_bs = 32
                else:
                    self.cuda_graph_max_bs = 160
+        elif is_npu() and gpu_mem < 64 * 1024:
+            # Atlas A2 and Atlas A3
+            # (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 32768
+            if self.cuda_graph_max_bs is None:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 64
+                else:
+                    self.cuda_graph_max_bs = 128
         elif gpu_mem < 90 * 1024:
             # H100, A100
             # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
@@ -634,6 +755,11 @@ class ServerArgs:
            else:
                self.cuda_graph_max_bs = max(self.cuda_graph_bs)
 
+        if self.piecewise_cuda_graph_tokens is None:
+            self.piecewise_cuda_graph_tokens = (
+                self._generate_piecewise_cuda_graph_tokens()
+            )
+
         if self.mem_fraction_static is None:
             # Constant meta data (e.g., from attention backend)
             reserved_mem = 512
@@ -712,6 +838,25 @@ class ServerArgs:
 
         return capture_bs
 
+    def _generate_piecewise_cuda_graph_tokens(self):
+        """
+        Generate the list of batch sizes for piecewise CUDA graph capture
+        based on piecewise_cuda_graph_max_tokens.
+        """
+        capture_sizes = (
+            list(range(4, 33, 4))
+            + list(range(48, 257, 16))
+            + list(range(288, 513, 32))
+            + list(range(640, 4096 + 1, 128))
+            + list(range(4352, self.piecewise_cuda_graph_max_tokens + 1, 256))
+        )
+
+        capture_sizes = [
+            s for s in capture_sizes if s <= self.piecewise_cuda_graph_max_tokens
+        ]
+
+        return capture_sizes
+
     def _handle_hpu_backends(self):
         if self.device == "hpu":
             self.attention_backend = "torch_native"
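For reference, the capture schedule above steps by 4 up to 32 tokens, by 16 up to 256, by 32 up to 512, by 128 up to 4096, and by 256 beyond that, then clips at the configured maximum. A standalone restatement of that arithmetic (a sketch, not the ServerArgs method itself):

# Sketch re-deriving the piecewise capture sizes for a given --piecewise-cuda-graph-max-tokens.
def piecewise_capture_sizes(max_tokens: int = 4096) -> list[int]:
    sizes = (
        list(range(4, 33, 4))                     # 4, 8, ..., 32
        + list(range(48, 257, 16))                # 48, 64, ..., 256
        + list(range(288, 513, 32))               # 288, 320, ..., 512
        + list(range(640, 4096 + 1, 128))         # 640, 768, ..., 4096
        + list(range(4352, max_tokens + 1, 256))  # only used when max_tokens > 4096
    )
    return [s for s in sizes if s <= max_tokens]

print(piecewise_capture_sizes(1024)[-4:])  # [640, 768, 896, 1024]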
@@ -731,21 +876,54 @@ class ServerArgs:
 
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
-        if model_arch in ["GptOssForCausalLM"]:
-            if self.attention_backend is None:
+        if model_arch in ["DeepseekV3ForCausalLM"] and not is_deepseek_nsa(hf_config):
+            if is_cuda() and is_sm100_supported():
+                if (
+                    self.attention_backend is None
+                    and self.prefill_attention_backend is None
+                    and self.decode_attention_backend is None
+                ):
+                    self.attention_backend = "trtllm_mla"
+                    logger.info(
+                        "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if (
+                    self.quantization == "modelopt_fp4"
+                    and self.moe_runner_backend == "auto"
+                ):
+                    self.moe_runner_backend = "flashinfer_trtllm"
+                    logger.info(
+                        "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+
+        elif model_arch in ["GptOssForCausalLM"]:
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
                 if is_cuda() and is_sm100_supported():
                     self.attention_backend = "trtllm_mha"
                 elif is_cuda() and is_sm90_supported():
                     self.attention_backend = "fa3"
                 else:
                     self.attention_backend = "triton"
-            supported_backends = ["triton", "trtllm_mha", "fa3"]
-            logger.info(
-                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
-            )
+
+            supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
+            prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
             assert (
-                self.attention_backend in supported_backends
-            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+                prefill_attn_backend in supported_backends
+                and decode_attn_backend in supported_backends
+            ), (
+                f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got the following backends\n"
+                f"- Prefill: {prefill_attn_backend}\n"
+                f"- Decode: {decode_attn_backend}\n"
+            )
 
             if is_sm100_supported():
                 if not self.enable_dp_attention:
@@ -802,6 +980,31 @@ class ServerArgs:
                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
            )
            self.disable_hybrid_swa_memory = True
+        elif model_arch in ["Olmo2ForCausalLM"]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with Olmo3 model.
+            logger.warning(
+                f"Disabling hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
+            if self.attention_backend is None:
+                if is_cuda() and is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                elif is_cuda() and get_device_sm() >= 80:
+                    self.attention_backend = "fa3"
+                else:
+                    self.attention_backend = "triton"
+
+            # Flashinfer appears to degrade performance when sliding window attention
+            # is used for the Olmo2 architecture. Olmo2 does not use sliding window attention
+            # but Olmo3 does.
+            assert (
+                self.attention_backend != "flashinfer"
+            ), "FlashInfer backend can significantly degrade the performance of Olmo3 models."
+
+            logger.info(
+                f"Using {self.attention_backend} as attention backend for {model_arch}."
+            )
 
         if is_deepseek_nsa(hf_config):
             if (
@@ -820,9 +1023,6 @@ class ServerArgs:
                 self.page_size = 64
                 logger.warning("Setting page size to 64 for DeepSeek NSA.")
 
-            self.mem_fraction_static = 0.8
-            logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
-
             # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
             import torch
 
@@ -832,10 +1032,10 @@ class ServerArgs:
                 logger.warning("Setting KV cache dtype to fp8.")
 
             if self.kv_cache_dtype == "fp8_e4m3":
-                self.nsa_prefill = "flashmla_decode"
-                self.nsa_decode = "flashmla_decode"
+                self.nsa_prefill_backend = "flashmla_kv"
+                self.nsa_decode_backend = "flashmla_kv"
                 logger.warning(
-                    "Setting NSA backend to flashmla_decode for FP8 KV Cache."
+                    "Setting NSA backend to flashmla_kv for FP8 KV Cache."
                 )
 
             # Logging env vars for NSA
@@ -934,6 +1134,22 @@ class ServerArgs:
             self.enable_mixed_chunk = False
             self.disable_radix_cache = True
 
+        if self.attention_backend == "intel_xpu":
+            if self.page_size not in [32, 64, 128]:
+                logger.warning(
+                    f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
+                )
+                self.page_size = 128
+        if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
+            raise ValueError(
+                "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
+            )
+        if self.prefill_attention_backend == "fa4":
+            logger.warning(
+                f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
+            )
+            self.page_size = 128
+
     def _handle_page_size(self):
         if self.page_size is None:
             self.page_size = 1
@@ -983,7 +1199,7 @@ class ServerArgs:
                 "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
             )
 
-    def _handle_deepep_moe(self):
+    def _handle_a2a_moe(self):
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -993,6 +1209,12 @@ class ServerArgs:
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
+        if self.moe_a2a_backend == "mooncake":
+            self.ep_size = self.tp_size
+            logger.warning(
+                f"Mooncake MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
     def _handle_eplb_and_dispatch(self):
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
@@ -1008,6 +1230,15 @@ class ServerArgs:
         if self.enable_eplb:
             assert self.ep_size > 1
 
+    def _handle_elastic_ep(self):
+        if self.elastic_ep_backend is not None:
+            if self.enable_eplb:
+                if self.eplb_algorithm == "auto":
+                    self.eplb_algorithm = "elasticity_aware"
+                assert (
+                    self.eplb_algorithm == "elasticity_aware"
+                ), "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware'."
+
     def _handle_expert_distribution_metrics(self):
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -1058,11 +1289,22 @@ class ServerArgs:
             )
             if self.max_running_requests is None:
                 self.max_running_requests = 48
-            self.disable_overlap_schedule = True
-            logger.warning(
-                "Overlap scheduler is disabled because of using "
-                "eagle speculative decoding."
-            )
+                logger.warning(
+                    "Max running requests is reset to 48 for speculative decoding."
+                )
+
+            if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
+                self.disable_overlap_schedule = False
+                logger.warning(
+                    "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
+                )
+
+            if not self.enable_beta_spec:
+                self.disable_overlap_schedule = True
+                logger.warning(
+                    "Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
+                )
+
             if self.enable_mixed_chunk:
                 self.enable_mixed_chunk = False
                 logger.warning(
@@ -1216,6 +1458,26 @@ class ServerArgs:
                 "Please choose one tokenizer batching approach."
             )
 
+        if self.skip_tokenizer_init:
+            if self.tokenizer_worker_num != 1:
+                logger.warning(
+                    "skip_tokenizer_init=True disables tokenizer workers; forcing tokenizer_worker_num=1 "
+                    f"(requested {self.tokenizer_worker_num})."
+                )
+                self.tokenizer_worker_num = 1
+
+            if self.enable_tokenizer_batch_encode:
+                logger.warning(
+                    "skip_tokenizer_init=True ignores --enable-tokenizer-batch-encode; disabling it."
+                )
+                self.enable_tokenizer_batch_encode = False
+
+            if self.enable_dynamic_batch_tokenizer:
+                logger.warning(
+                    "skip_tokenizer_init=True ignores --enable-dynamic-batch-tokenizer; disabling it."
+                )
+                self.enable_dynamic_batch_tokenizer = False
+
     def _handle_environment_variables(self):
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
@@ -1261,13 +1523,27 @@ class ServerArgs:
             )
 
         # Check attention backend
-        if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+        if self.attention_backend is None:
+            # User didn't specify attention backend, fallback based on GPU architecture
+            if is_sm100_supported() or is_sm120_supported():
+                # Blackwell and newer architectures
+                self.attention_backend = "flashinfer"
+            else:
+                # Hopper (SM90) and older architectures
+                self.attention_backend = "fa3"
+            logger.warning(
+                f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
+                f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
+            )
+        elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+            # User explicitly specified an incompatible attention backend
             raise ValueError(
-                f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
+                f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
+                f"but you explicitly specified '{self.attention_backend}'."
             )
 
-        # Currently, only FA3 supports radix cache. Support for other backends is in progress
-        if self.attention_backend != "fa3":
+        # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
+        if self.attention_backend not in ["fa3", "triton"]:
             self.disable_radix_cache = True
             logger.warning(
                 f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
@@ -1286,6 +1562,7 @@ class ServerArgs:
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
+
         # Model and tokenizer
         parser.add_argument(
             "--model-path",
@@ -1405,6 +1682,11 @@ class ServerArgs:
             default=ServerArgs.port,
             help="The port of the HTTP server.",
         )
+        parser.add_argument(
+            "--grpc-mode",
+            action="store_true",
+            help="If set, use gRPC server instead of HTTP server.",
+        )
         parser.add_argument(
             "--skip-server-warmup",
             action="store_true",
@@ -1423,6 +1705,12 @@ class ServerArgs:
             default=ServerArgs.nccl_port,
             help="The port for NCCL distributed environment setup. Defaults to a random port.",
        )
+        parser.add_argument(
+            "--checkpoint-engine-wait-weights-before-ready",
+            action="store_true",
+            help="If set, the server will wait for initial weights to be loaded via checkpoint-engine or other update methods "
+            "before serving inference requests.",
+        )
 
         # Quantization and data type
         parser.add_argument(
@@ -1455,12 +1743,51 @@ class ServerArgs:
             "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
             "default to 1.0, which may cause accuracy issues. ",
         )
+        parser.add_argument(
+            "--modelopt-quant",
+            type=str,
+            default=ServerArgs.modelopt_quant,
+            help="The ModelOpt quantization configuration. "
+            "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
+            "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
+        )
+        parser.add_argument(
+            "--modelopt-checkpoint-restore-path",
+            type=str,
+            default=ServerArgs.modelopt_checkpoint_restore_path,
+            help="Path to restore a previously saved ModelOpt quantized checkpoint. "
+            "If provided, the quantization process will be skipped and the model "
+            "will be loaded from this checkpoint.",
+        )
+        parser.add_argument(
+            "--modelopt-checkpoint-save-path",
+            type=str,
+            default=ServerArgs.modelopt_checkpoint_save_path,
+            help="Path to save the ModelOpt quantized checkpoint after quantization. "
+            "This allows reusing the quantized model in future runs.",
+        )
+        parser.add_argument(
+            "--modelopt-export-path",
+            type=str,
+            default=ServerArgs.modelopt_export_path,
+            help="Path to export the quantized model in HuggingFace format after ModelOpt quantization. "
+            "The exported model can then be used directly with SGLang for inference. "
+            "If not provided, the model will not be exported.",
+        )
+        parser.add_argument(
+            "--quantize-and-serve",
+            action="store_true",
+            default=ServerArgs.quantize_and_serve,
+            help="Quantize the model with ModelOpt and immediately serve it without exporting. "
+            "This is useful for development and prototyping. For production, it's recommended "
+            "to use separate quantization and deployment steps.",
+        )
         parser.add_argument(
             "--kv-cache-dtype",
             type=str,
             default=ServerArgs.kv_cache_dtype,
-            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
-            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
+            choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
+            help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
        )
        parser.add_argument(
            "--enable-fp32-lm-head",
@@ -1519,6 +1846,12 @@ class ServerArgs:
1519
1846
  default=ServerArgs.enable_priority_scheduling,
1520
1847
  help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
1521
1848
  )
1849
+ parser.add_argument(
1850
+ "--abort-on-priority-when-disabled",
1851
+ action="store_true",
1852
+ default=ServerArgs.abort_on_priority_when_disabled,
1853
+ help="If set, abort requests that specify a priority when priority scheduling is disabled.",
1854
+ )
1522
1855
  parser.add_argument(
1523
1856
  "--schedule-low-priority-values-first",
1524
1857
  action="store_true",
@@ -1575,6 +1908,21 @@ class ServerArgs:
1575
1908
  default=ServerArgs.device,
1576
1909
  help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
1577
1910
  )
1911
+ parser.add_argument(
1912
+ "--elastic-ep-backend",
1913
+ type=str,
1914
+ default=ServerArgs.elastic_ep_backend,
1915
+ choices=["none", "mooncake"],
1916
+ help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
1917
+ )
1918
+ parser.add_argument(
1919
+ "--mooncake-ib-device",
1920
+ type=str,
1921
+ default=ServerArgs.mooncake_ib_device,
1922
+ help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
1923
+ "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
1924
+ "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
1925
+ )
1578
1926
  parser.add_argument(
1579
1927
  "--tensor-parallel-size",
1580
1928
  "--tp-size",
@@ -1590,9 +1938,9 @@ class ServerArgs:
1590
1938
  help="The pipeline parallelism size.",
1591
1939
  )
1592
1940
  parser.add_argument(
1593
- "--max-micro-batch-size",
1941
+ "--pp-max-micro-batch-size",
1594
1942
  type=int,
1595
- default=ServerArgs.max_micro_batch_size,
1943
+ default=ServerArgs.pp_max_micro_batch_size,
1596
1944
  help="The maximum micro batch size in pipeline parallelism.",
1597
1945
  )
1598
1946
  parser.add_argument(
@@ -1616,7 +1964,12 @@ class ServerArgs:
1616
1964
  "--constrained-json-whitespace-pattern",
1617
1965
  type=str,
1618
1966
  default=ServerArgs.constrained_json_whitespace_pattern,
- help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+ help="(outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model to generate consecutive whitespaces, set the pattern to [\n\t ]*",
+ )
+ parser.add_argument(
+ "--constrained-json-disable-any-whitespace",
+ action="store_true",
+ help="(xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output.",
  )
  parser.add_argument(
  "--watchdog-timeout",
@@ -1857,6 +2210,16 @@ class ServerArgs:
  default=ServerArgs.tool_call_parser,
  help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
  )
+ parser.add_argument(
+ "--sampling-defaults",
+ type=str,
+ choices=["openai", "model"],
+ default=ServerArgs.sampling_defaults,
+ help="Where to get default sampling parameters. "
+ "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
+ "'model' uses the model's generation_config.json to get the recommended "
+ "sampling parameters if available. Default is 'model'.",
+ )
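
A minimal conceptual sketch (not SGLang's actual resolution code) of how the two --sampling-defaults modes described above differ when a request omits temperature; generation_config stands in for the model's generation_config.json:

    def default_temperature(sampling_defaults: str, generation_config: dict) -> float:
        # "model": prefer the value recommended by the model, when present.
        if sampling_defaults == "model" and "temperature" in generation_config:
            return generation_config["temperature"]
        # "openai": fall back to the OpenAI-style default named in the help text.
        return 1.0

    default_temperature("model", {"temperature": 0.6})   # -> 0.6
    default_temperature("openai", {"temperature": 0.6})  # -> 1.0
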
  parser.add_argument(
  "--tool-server",
  type=str,
@@ -1966,6 +2329,13 @@ class ServerArgs:
  default=ServerArgs.max_loaded_loras,
  help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
  )
+ parser.add_argument(
+ "--lora-eviction-policy",
+ type=str,
+ default=DEFAULT_LORA_EVICTION_POLICY,
+ choices=["lru", "fifo"],
+ help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
+ )
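
An illustrative sketch of the difference between the two eviction policies named above, using an OrderedDict as a stand-in for the adapter pool (this is not SGLang's implementation; AdapterPool and loader are hypothetical):

    from collections import OrderedDict

    class AdapterPool:
        def __init__(self, capacity: int, policy: str = "lru"):
            self.capacity, self.policy = capacity, policy
            self.adapters = OrderedDict()  # adapter name -> loaded weights

        def get(self, name: str, loader):
            if name in self.adapters:
                if self.policy == "lru":
                    self.adapters.move_to_end(name)  # refresh recency on every hit
            else:
                if len(self.adapters) >= self.capacity:
                    # LRU evicts the least recently used; FIFO evicts the earliest loaded.
                    self.adapters.popitem(last=False)
                self.adapters[name] = loader(name)
            return self.adapters[name]
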
  parser.add_argument(
  "--lora-backend",
  type=str,
@@ -2025,19 +2395,20 @@ class ServerArgs:
  help="Set multimodal attention backend.",
  )
  parser.add_argument(
- "--nsa-prefill",
- default=ServerArgs.nsa_prefill,
+ "--nsa-prefill-backend",
+ default=ServerArgs.nsa_prefill_backend,
  type=str,
  choices=NSA_CHOICES,
  )
  parser.add_argument(
- "--nsa-decode",
- default=ServerArgs.nsa_decode,
+ "--nsa-decode-backend",
+ default=ServerArgs.nsa_decode_backend,
  type=str,
  choices=NSA_CHOICES,
  )

  # Speculative decoding
+ parser.add_argument("--enable-beta-spec", action="store_true")
  parser.add_argument(
  "--speculative-algorithm",
  type=str,
@@ -2058,6 +2429,15 @@ class ServerArgs:
  "name, a tag name, or a commit id. If unspecified, will use "
  "the default version.",
  )
+ parser.add_argument(
+ "--speculative-draft-load-format",
+ type=str,
+ default=ServerArgs.speculative_draft_load_format,
+ choices=LOAD_FORMAT_CHOICES,
+ help="The format of the draft model weights to load. "
+ "If not specified, will use the same format as --load-format. "
+ "Use 'dummy' to initialize draft model weights with random values for profiling.",
+ )
  parser.add_argument(
  "--speculative-num-steps",
  type=int,
@@ -2158,22 +2538,14 @@ class ServerArgs:
  parser.add_argument(
  "--moe-a2a-backend",
  type=str,
- choices=["none", "deepep"],
+ choices=["none", "deepep", "mooncake"],
  default=ServerArgs.moe_a2a_backend,
  help="Choose the backend for MoE A2A.",
  )
  parser.add_argument(
  "--moe-runner-backend",
  type=str,
- choices=[
- "auto",
- "triton",
- "triton_kernel",
- "flashinfer_trtllm",
- "flashinfer_cutlass",
- "flashinfer_mxfp4",
- "flashinfer_cutedsl",
- ],
+ choices=MOE_RUNNER_BACKEND_CHOICES,
  default=ServerArgs.moe_runner_backend,
  help="Choose the runner backend for MoE.",
  )
@@ -2287,6 +2659,12 @@ class ServerArgs:
  choices=["float32", "bfloat16"],
  help="The data type of the SSM states in mamba cache.",
  )
+ parser.add_argument(
+ "--mamba-full-memory-ratio",
+ type=float,
+ default=ServerArgs.mamba_full_memory_ratio,
+ help="The ratio of mamba state memory to full kv cache memory.",
+ )

  # Hierarchical cache
  parser.add_argument(
@@ -2364,6 +2742,35 @@ class ServerArgs:
  help="Using LMCache as an alternative hierarchical cache solution",
  )

+ # Ktransformer server args
+ parser.add_argument(
+ "--kt-amx-weight-path",
+ type=str,
+ help="[ktransformers parameter] The path of the quantized expert weights for amx kernel. A local folder.",
+ )
+ parser.add_argument(
+ "--kt-amx-method",
+ type=str,
+ default="AMXINT4",
+ help="[ktransformers parameter] Quantization formats for CPU execution.",
+ )
+ parser.add_argument(
+ "--kt-cpuinfer",
+ type=int,
+ help="[ktransformers parameter] The number of CPUInfer threads.",
+ )
+ parser.add_argument(
+ "--kt-threadpool-count",
+ type=int,
+ default=2,
+ help="[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA).",
+ )
+ parser.add_argument(
+ "--kt-num-gpu-experts",
+ type=int,
+ help="[ktransformers parameter] The number of GPU experts.",
+ )
+
  # Double Sparsity
  parser.add_argument(
  "--enable-double-sparsity",
@@ -2433,6 +2840,14 @@ class ServerArgs:
  help="Mode of offloading.",
  )

+ # Args for multi-item-scoring
+ parser.add_argument(
+ "--multi-item-scoring-delimiter",
+ type=int,
+ default=ServerArgs.multi_item_scoring_delimiter,
+ help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
+ )
+
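
A small sketch of the sequence layout described in the help text above; the token IDs and delimiter value are illustrative only:

    delimiter = 151643                      # hypothetical delimiter token id
    query = [11, 12, 13]                    # tokenized query
    items = [[21, 22], [31, 32, 33]]        # tokenized items to score

    combined = list(query)
    for item in items:
        combined += [delimiter] + item      # Query<delimiter>Item1<delimiter>Item2...
    print(combined)  # [11, 12, 13, 151643, 21, 22, 151643, 31, 32, 33]
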
  # Optimization/debug options
  parser.add_argument(
  "--disable-radix-cache",
@@ -2491,6 +2906,11 @@ class ServerArgs:
  action="store_true",
  help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
  )
+ parser.add_argument(
+ "--disable-tokenizer-batch-decode",
+ action="store_true",
+ help="Disable batch decoding when decoding multiple completions.",
+ )
  parser.add_argument(
  "--disable-outlines-disk-cache",
  action="store_true",
@@ -2552,12 +2972,36 @@ class ServerArgs:
  action="store_true",
  help="Optimize the model with torch.compile. Experimental feature.",
  )
+ parser.add_argument(
+ "--enable-piecewise-cuda-graph",
+ action="store_true",
+ help="Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature.",
+ )
+ parser.add_argument(
+ "--piecewise-cuda-graph-tokens",
+ type=json_list_type,
+ default=ServerArgs.piecewise_cuda_graph_tokens,
+ help="Set the list of tokens when using piecewise cuda graph.",
+ )
+ parser.add_argument(
+ "--piecewise-cuda-graph-compiler",
+ type=str,
+ default=ServerArgs.piecewise_cuda_graph_compiler,
+ help="Set the compiler for piecewise cuda graph. Choices are: eager, inductor.",
+ choices=["eager", "inductor"],
+ )
  parser.add_argument(
  "--torch-compile-max-bs",
  type=int,
  default=ServerArgs.torch_compile_max_bs,
  help="Set the maximum batch size when using torch compile.",
  )
+ parser.add_argument(
+ "--piecewise-cuda-graph-max-tokens",
+ type=int,
+ default=ServerArgs.piecewise_cuda_graph_max_tokens,
+ help="Set the maximum tokens when using piecewise cuda graph.",
+ )
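
Assuming json_list_type parses a JSON array from the command line (an assumption based on its name, not verified against the helper's code), the new piecewise CUDA graph flags could be exercised like this sketch; the model path is a placeholder:

    import argparse
    from sglang.srt.server_args import ServerArgs

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args([
        "--model-path", "Qwen/Qwen2-0.5B-Instruct",
        "--enable-piecewise-cuda-graph",
        "--piecewise-cuda-graph-tokens", "[2048, 4096, 8192]",
        "--piecewise-cuda-graph-compiler", "inductor",
    ])
    print(args.enable_piecewise_cuda_graph, args.piecewise_cuda_graph_tokens)
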
  parser.add_argument(
  "--torchao-config",
  type=str,
@@ -2687,11 +3131,6 @@ class ServerArgs:
  default=ServerArgs.debug_tensor_dump_inject,
  help="Inject the outputs from jax as the input of every layer.",
  )
- parser.add_argument(
- "--debug-tensor-dump-prefill-only",
- action="store_true",
- help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
- )
  parser.add_argument(
  "--enable-dynamic-batch-tokenizer",
  action="store_true",
@@ -2813,6 +3252,12 @@ class ServerArgs:
  action="store_true",
  help="Enable PD-Multiplexing, PD running on greenctx stream.",
  )
+ parser.add_argument(
+ "--pdmux-config-path",
+ type=str,
+ default=None,
+ help="The path of the PD-Multiplexing config file.",
+ )

  parser.add_argument(
  "--sm-group-num",
@@ -2894,7 +3339,7 @@ class ServerArgs:
  self.model_path,
  trust_remote_code=self.trust_remote_code,
  revision=self.revision,
- model_override_args=json.loads(self.json_model_override_args),
+ model_override_args=orjson.loads(self.json_model_override_args),
  **kwargs,
  )
  return hf_config
@@ -2941,7 +3386,34 @@ class ServerArgs:
  self.chunked_prefill_size % self.page_size == 0
  ), "chunked_prefill_size must be divisible by page_size"

- # Check multi tokenizer
+ # Check pdmux
+ if self.enable_pdmux:
+ assert (
+ self.pp_size == 1
+ ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
+ assert (
+ self.chunked_prefill_size == -1
+ ), "PD-Multiplexing is not compatible with chunked prefill."
+ assert (
+ self.disaggregation_mode == "null"
+ ), "PD-Multiplexing is not compatible with disaggregation mode."
+ assert (
+ self.disable_overlap_schedule
+ ), "PD-Multiplexing is not compatible with overlap schedule."
+
+ # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
+ import torch
+
+ parts = torch.__version__.split("+", 1)[0].split(".")
+ major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
+ minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
+ if (major, minor) > (2, 6):
+ logger.warning(
+ "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
+ f" Current torch version is {torch.__version__}.\n"
+ " Please manually install torch 2.6.x."
+ )
+
  assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
  self.validate_buckets_rule(
  "--prompt-tokens-buckets", self.prompt_tokens_buckets
@@ -2957,6 +3429,17 @@ class ServerArgs:
  "lof",
  ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."

+ # Check multi-item scoring
+ if self.multi_item_scoring_delimiter is not None:
+ assert self.disable_radix_cache, (
+ "Multi-item scoring requires radix cache to be disabled. "
+ "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
+ )
+ assert self.chunked_prefill_size == -1, (
+ "Multi-item scoring requires chunked prefill to be disabled. "
+ "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
+ )
+
  def check_lora_server_args(self):
  assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

@@ -3141,6 +3624,22 @@ class ServerArgs:
  )


+ # NOTE: This is a global variable to hold the server args for scheduler.
+ _global_server_args: Optional[ServerArgs] = None
+
+
+ def set_global_server_args_for_scheduler(server_args: ServerArgs):
+ global _global_server_args
+ _global_server_args = server_args
+
+
+ def get_global_server_args() -> ServerArgs:
+ if _global_server_args is None:
+ raise ValueError("Global server args is not set yet!")
+
+ return _global_server_args
+
+
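
A brief usage sketch for the module-level accessors added above, assuming they are exported from sglang.srt.server_args; the model path is a placeholder and prepare_server_args is the helper defined just below:

    from sglang.srt.server_args import (
        get_global_server_args,
        prepare_server_args,
        set_global_server_args_for_scheduler,
    )

    server_args = prepare_server_args(["--model-path", "Qwen/Qwen2-0.5B-Instruct"])

    # Scheduler start-up publishes the parsed args once...
    set_global_server_args_for_scheduler(server_args)

    # ...so scheduler-side modules can read them without threading them through every call.
    assert get_global_server_args() is server_args
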
  def prepare_server_args(argv: List[str]) -> ServerArgs:
  """
  Prepare the server arguments from the command line arguments.
@@ -3175,11 +3674,12 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
  parser = argparse.ArgumentParser()
  ServerArgs.add_cli_args(parser)
  raw_args = parser.parse_args(argv)
- server_args = ServerArgs.from_cli_args(raw_args)
- return server_args
+
+ return ServerArgs.from_cli_args(raw_args)


  ZMQ_TCP_PORT_DELTA = 233
+ DP_ATTENTION_HANDSHAKE_PORT_DELTA = 5


  @dataclasses.dataclass
@@ -3204,7 +3704,11 @@ class PortArgs:
  tokenizer_worker_ipc_name: Optional[str]

  @staticmethod
- def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
+ def init_new(
+ server_args: ServerArgs,
+ dp_rank: Optional[int] = None,
+ worker_ports: Optional[List[int]] = None,
+ ) -> PortArgs:
  if server_args.nccl_port is None:
  nccl_port = server_args.port + random.randint(100, 1000)
  while True:
@@ -3251,8 +3755,8 @@ class PortArgs:
  # TokenizerManager to DataParallelController
  scheduler_input_port = port_base + 4
  else:
- scheduler_input_port = port_base + 4 + 1 + dp_rank
-
+ assert worker_ports is not None
+ scheduler_input_port = worker_ports[dp_rank]
  return PortArgs(
  tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
  scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",