sglang-0.5.2rc2-py3-none-any.whl → sglang-0.5.3rc0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (238)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/device_config.py +3 -1
  5. sglang/srt/configs/dots_vlm.py +139 -0
  6. sglang/srt/configs/load_config.py +1 -0
  7. sglang/srt/configs/model_config.py +50 -6
  8. sglang/srt/configs/qwen3_next.py +326 -0
  9. sglang/srt/connector/__init__.py +8 -1
  10. sglang/srt/connector/remote_instance.py +82 -0
  11. sglang/srt/constrained/base_grammar_backend.py +48 -12
  12. sglang/srt/constrained/llguidance_backend.py +0 -1
  13. sglang/srt/constrained/outlines_backend.py +0 -1
  14. sglang/srt/constrained/xgrammar_backend.py +28 -9
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/base/conn.py +1 -1
  21. sglang/srt/disaggregation/common/conn.py +15 -12
  22. sglang/srt/disaggregation/decode.py +21 -10
  23. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -445
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +5 -3
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +24 -3
  31. sglang/srt/entrypoints/engine.py +38 -17
  32. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  33. sglang/srt/entrypoints/grpc_server.py +680 -0
  34. sglang/srt/entrypoints/http_server.py +85 -54
  35. sglang/srt/entrypoints/openai/protocol.py +4 -1
  36. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  37. sglang/srt/entrypoints/openai/serving_chat.py +36 -16
  38. sglang/srt/entrypoints/openai/serving_completions.py +12 -3
  39. sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
  40. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  41. sglang/srt/entrypoints/openai/serving_responses.py +6 -3
  42. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  43. sglang/srt/eplb/eplb_manager.py +2 -2
  44. sglang/srt/eplb/expert_distribution.py +26 -13
  45. sglang/srt/eplb/expert_location.py +8 -3
  46. sglang/srt/eplb/expert_location_updater.py +1 -1
  47. sglang/srt/function_call/base_format_detector.py +3 -6
  48. sglang/srt/function_call/ebnf_composer.py +11 -9
  49. sglang/srt/function_call/function_call_parser.py +6 -0
  50. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  51. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  52. sglang/srt/grpc/__init__.py +1 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  56. sglang/srt/hf_transformers_utils.py +4 -0
  57. sglang/srt/layers/activation.py +142 -9
  58. sglang/srt/layers/attention/ascend_backend.py +11 -4
  59. sglang/srt/layers/attention/fla/chunk.py +242 -0
  60. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  61. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  62. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  63. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  66. sglang/srt/layers/attention/fla/index.py +37 -0
  67. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  68. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  69. sglang/srt/layers/attention/fla/op.py +66 -0
  70. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  71. sglang/srt/layers/attention/fla/utils.py +331 -0
  72. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  73. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  74. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  75. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  76. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  77. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  80. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  81. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  82. sglang/srt/layers/attention/triton_backend.py +18 -1
  83. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  84. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  85. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  86. sglang/srt/layers/dp_attention.py +30 -1
  87. sglang/srt/layers/layernorm.py +32 -15
  88. sglang/srt/layers/linear.py +34 -3
  89. sglang/srt/layers/logits_processor.py +29 -10
  90. sglang/srt/layers/moe/__init__.py +2 -1
  91. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  92. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  93. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  94. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  95. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  96. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  107. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  108. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  109. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  110. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  111. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  112. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  113. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  114. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  115. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  116. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  117. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  118. sglang/srt/layers/moe/topk.py +30 -9
  119. sglang/srt/layers/moe/utils.py +12 -6
  120. sglang/srt/layers/quantization/awq.py +19 -7
  121. sglang/srt/layers/quantization/base_config.py +11 -6
  122. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  123. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  124. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  125. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  126. sglang/srt/layers/quantization/fp8.py +76 -47
  127. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  128. sglang/srt/layers/quantization/gptq.py +25 -17
  129. sglang/srt/layers/quantization/modelopt_quant.py +147 -47
  130. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  131. sglang/srt/layers/quantization/mxfp4.py +64 -40
  132. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  133. sglang/srt/layers/quantization/unquant.py +135 -47
  134. sglang/srt/layers/quantization/w4afp8.py +30 -17
  135. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  136. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  137. sglang/srt/layers/sampler.py +162 -18
  138. sglang/srt/lora/backend/base_backend.py +50 -8
  139. sglang/srt/lora/backend/triton_backend.py +90 -2
  140. sglang/srt/lora/layers.py +32 -0
  141. sglang/srt/lora/lora.py +4 -1
  142. sglang/srt/lora/lora_manager.py +35 -112
  143. sglang/srt/lora/mem_pool.py +24 -10
  144. sglang/srt/lora/utils.py +18 -9
  145. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  146. sglang/srt/managers/cache_controller.py +158 -160
  147. sglang/srt/managers/data_parallel_controller.py +105 -35
  148. sglang/srt/managers/detokenizer_manager.py +8 -4
  149. sglang/srt/managers/disagg_service.py +46 -0
  150. sglang/srt/managers/io_struct.py +199 -12
  151. sglang/srt/managers/mm_utils.py +1 -0
  152. sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
  153. sglang/srt/managers/schedule_batch.py +77 -56
  154. sglang/srt/managers/schedule_policy.py +1 -1
  155. sglang/srt/managers/scheduler.py +187 -39
  156. sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
  157. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  158. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  159. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  160. sglang/srt/managers/tokenizer_manager.py +259 -519
  161. sglang/srt/managers/tp_worker.py +53 -4
  162. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  163. sglang/srt/mem_cache/hicache_storage.py +3 -23
  164. sglang/srt/mem_cache/hiradix_cache.py +103 -43
  165. sglang/srt/mem_cache/memory_pool.py +347 -48
  166. sglang/srt/mem_cache/memory_pool_host.py +105 -46
  167. sglang/srt/mem_cache/radix_cache.py +0 -2
  168. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  169. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  170. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
  171. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  172. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  173. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
  174. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  175. sglang/srt/metrics/collector.py +493 -76
  176. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  177. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  178. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  179. sglang/srt/model_executor/forward_batch_info.py +59 -2
  180. sglang/srt/model_executor/model_runner.py +356 -29
  181. sglang/srt/model_loader/__init__.py +9 -3
  182. sglang/srt/model_loader/loader.py +128 -4
  183. sglang/srt/model_loader/weight_utils.py +2 -1
  184. sglang/srt/models/apertus.py +686 -0
  185. sglang/srt/models/bailing_moe.py +798 -218
  186. sglang/srt/models/bailing_moe_nextn.py +168 -0
  187. sglang/srt/models/deepseek_v2.py +109 -15
  188. sglang/srt/models/dots_vlm.py +174 -0
  189. sglang/srt/models/dots_vlm_vit.py +337 -0
  190. sglang/srt/models/ernie4.py +1 -1
  191. sglang/srt/models/gemma3n_mm.py +1 -1
  192. sglang/srt/models/glm4_moe.py +1 -1
  193. sglang/srt/models/glm4v.py +4 -2
  194. sglang/srt/models/glm4v_moe.py +3 -0
  195. sglang/srt/models/gpt_oss.py +1 -1
  196. sglang/srt/models/llama4.py +9 -0
  197. sglang/srt/models/llama_eagle3.py +13 -0
  198. sglang/srt/models/longcat_flash.py +2 -2
  199. sglang/srt/models/mllama4.py +25 -0
  200. sglang/srt/models/opt.py +637 -0
  201. sglang/srt/models/qwen2.py +7 -0
  202. sglang/srt/models/qwen2_5_vl.py +27 -3
  203. sglang/srt/models/qwen2_moe.py +56 -12
  204. sglang/srt/models/qwen3_moe.py +1 -1
  205. sglang/srt/models/qwen3_next.py +1042 -0
  206. sglang/srt/models/qwen3_next_mtp.py +112 -0
  207. sglang/srt/models/step3_vl.py +1 -1
  208. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  209. sglang/srt/multimodal/processors/glm4v.py +9 -9
  210. sglang/srt/multimodal/processors/internvl.py +141 -129
  211. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  212. sglang/srt/offloader.py +27 -3
  213. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  214. sglang/srt/sampling/sampling_batch_info.py +18 -15
  215. sglang/srt/server_args.py +276 -35
  216. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  217. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  218. sglang/srt/speculative/eagle_utils.py +0 -2
  219. sglang/srt/speculative/eagle_worker.py +43 -4
  220. sglang/srt/speculative/spec_info.py +5 -0
  221. sglang/srt/speculative/standalone_worker.py +109 -0
  222. sglang/srt/tracing/trace.py +552 -0
  223. sglang/srt/utils.py +34 -3
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  226. sglang/test/runners.py +4 -0
  227. sglang/test/test_cutlass_moe.py +24 -6
  228. sglang/test/test_disaggregation_utils.py +66 -0
  229. sglang/test/test_fp4_moe.py +370 -1
  230. sglang/test/test_utils.py +28 -1
  231. sglang/utils.py +11 -0
  232. sglang/version.py +1 -1
  233. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  234. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
  235. sglang/srt/disaggregation/launch_lb.py +0 -118
  236. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  237. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  238. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -19,10 +19,12 @@ import json
 import logging
 import os
 import random
+import socket
 import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
+from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
@@ -36,14 +38,18 @@ from sglang.srt.utils import (
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
+    json_list_type,
     nullable_str,
+    parse_connector_type,
 )
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
@@ -60,6 +66,7 @@ LOAD_FORMAT_CHOICES = [
     "bitsandbytes",
     "layered",
     "remote",
+    "remote_instance",
 ]
 
 QUANTIZATION_CHOICES = [
@@ -94,6 +101,7 @@ ATTENTION_BACKEND_CHOICES = [
     "trtllm_mla",
     "trtllm_mha",
     "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
     # AMD specific
     "aiter",
     "wave",
@@ -104,6 +112,8 @@ ATTENTION_BACKEND_CHOICES = [
 
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -122,6 +132,10 @@ def add_disagg_transfer_backend_choices(choices):
     DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
 
 
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
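Note: the new add_grammar_backend_choices hook follows the same pattern as the existing add_load_format_choices and add_disagg_transfer_backend_choices helpers, and matters because a later hunk switches --grammar-backend to validate against GRAMMAR_BACKEND_CHOICES. A minimal sketch of how external code might register a custom backend name before the parser is built (the "my_grammar" choice is hypothetical):

    import argparse

    from sglang.srt.server_args import ServerArgs, add_grammar_backend_choices

    # Register the extra choice before the parser is built so that
    # argparse accepts it for --grammar-backend.
    add_grammar_backend_choices(["my_grammar"])

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args(
        ["--model-path", "my-org/my-model", "--grammar-backend", "my_grammar"]
    )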
@@ -191,6 +205,8 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
+    tokenizer_metrics_custom_labels_header: str = "x-customer-labels"
+    tokenizer_metrics_allowed_customer_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
@@ -201,6 +217,8 @@ class ServerArgs:
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
     gc_warning_threshold_secs: float = 0.0
+    enable_trace: bool = False
+    oltp_traces_endpoint: str = "localhost:4317"
 
     # API related
     api_key: Optional[str] = None
@@ -217,6 +235,9 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    load_watch_interval: float = 0.1
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -249,12 +270,14 @@ class ServerArgs:
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
 
     # Expert parallelism
     ep_size: int = 1
@@ -296,6 +319,8 @@ class ServerArgs:
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -340,6 +365,7 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
@@ -351,6 +377,12 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+
+    # Dynamic batch tokenizer
+    enable_dynamic_batch_tokenizer: bool = False
+    dynamic_batch_tokenizer_batch_size: int = 32
+    dynamic_batch_tokenizer_batch_timeout: float = 0.002
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -359,7 +391,7 @@ class ServerArgs:
     debug_tensor_dump_prefill_only: bool = False
 
     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
-    disaggregation_mode: str = "null"
+    disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
@@ -367,20 +399,32 @@ class ServerArgs:
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-    pdlb_url: Optional[str] = None
+
+    # FIXME: hack to reduce ITL when decode bs is small
+    disaggregation_decode_polling_interval: int = 1
 
     # For model weight update
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
 
+    # Remote instance weight loading
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
+
     # For PD-Multiplexing
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_cutedsl_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_triton_kernel_moe: bool = False
     enable_flashinfer_mxfp4_moe: bool = False
@@ -402,6 +446,11 @@ class ServerArgs:
             print_deprecated_warning(
                 "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
            )
+        if self.enable_flashinfer_cutedsl_moe:
+            self.moe_runner_backend = "flashinfer_cutedsl"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
+            )
         if self.enable_flashinfer_cutlass_moe:
             self.moe_runner_backend = "flashinfer_cutlass"
             print_deprecated_warning(
@@ -421,6 +470,7 @@ class ServerArgs:
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
+
         if self.served_model_name is None:
             self.served_model_name = self.model_path
         if self.device is None:
@@ -464,9 +514,14 @@ class ServerArgs:
             # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
             reserved_mem = 32 * 1024
 
+            # draft model and larger cuda graph buffers
             if self.speculative_algorithm is not None:
-                # draft model and larger cuda graph buffers
-                reserved_mem += 2 * 1024
+                if self.speculative_algorithm == "STANDALONE":
+                    # Standalone speculative decoding needs more memory than other speculative
+                    # decoding algorithms since the draft model is typically larger.
+                    reserved_mem += 6 * 1024
+                else:
+                    reserved_mem += 2 * 1024
             if self.enable_dp_attention:
                 reserved_mem += 4 * 1024
 
@@ -509,7 +564,8 @@ class ServerArgs:
             self.sampling_backend = "pytorch"
 
         # Model-specific adjustments
-        self.model_specific_adjustments()
+        if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
+            self.model_specific_adjustments()
 
         # Set kernel backends
         if self.device == "cpu":
@@ -528,7 +584,7 @@ class ServerArgs:
             )
             self.disable_cuda_graph = True
 
-        if self.attention_backend == "ascend":
+        if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
@@ -608,12 +664,13 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+            self.enable_dp_lm_head = False
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -636,11 +693,13 @@ class ServerArgs:
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
         if self.moe_runner_backend == "flashinfer_trtllm":
-            if not self.disable_shared_experts_fusion:
-                self.disable_shared_experts_fusion = True
-                logger.warning(
-                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
-                )
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )
 
         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
@@ -690,12 +749,24 @@ class ServerArgs:
             self.hicache_io_backend = "kernel"
             self.hicache_mem_layout = "page_first"
 
+        if self.hicache_mem_layout == "page_first_direct":
+            if self.hicache_io_backend != "direct":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "Page first direct layout only support direct io backend"
+                )
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -711,7 +782,12 @@ class ServerArgs:
             )
 
         model_arch = self.get_hf_config().architectures[0]
-        if model_arch in ["DeepseekV3ForCausalLM", "Glm4MoeForCausalLM"]:
+        if model_arch in [
+            "DeepseekV3ForCausalLM",
+            "Glm4MoeForCausalLM",
+            "BailingMoeForCausalLM",
+            "BailingMoeV2ForCausalLM",
+        ]:
             # Auto set draft_model_path DeepSeek-V3/R1
             if self.speculative_draft_model_path is None:
                 self.speculative_draft_model_path = self.model_path
@@ -770,12 +846,19 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
-        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
         if self.custom_weight_loader is None:
             self.custom_weight_loader = []
 
+        if self.load_format == "remote_instance":
+            if (
+                self.remote_instance_weight_loader_seed_instance_ip is None
+                or self.remote_instance_weight_loader_seed_instance_service_port is None
+                or self.remote_instance_weight_loader_send_weights_group_ports is None
+            ):
+                self.load_format = "auto"
+
         # PD disaggregation
         if self.disaggregation_mode == "decode":
             assert (
@@ -787,6 +870,13 @@ class ServerArgs:
 
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -799,10 +889,19 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
+        # Validation: prevent both tokenizer batching features from being enabled
+        if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
+            raise ValueError(
+                "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
+                "Please choose one tokenizer batching approach."
+            )
+
         # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
         # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
@@ -814,6 +913,14 @@ class ServerArgs:
                 "and cannot be used at the same time. Please use only one of them."
             )
 
+        if (
+            not self.tokenizer_metrics_custom_labels_header
+            and self.tokenizer_metrics_allowed_customer_labels
+        ):
+            raise ValueError(
+                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-customer-labels."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -825,16 +932,28 @@ class ServerArgs:
             required=True,
         )
         parser.add_argument(
-            "--tokenizer-path",
+            "--remote-instance-weight-loader-seed-instance-ip",
             type=str,
-            default=ServerArgs.tokenizer_path,
-            help="The path of the tokenizer.",
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
+            help="The ip of the seed instance for loading weights from remote instance.",
         )
         parser.add_argument(
-            "--tokenizer-worker-num",
+            "--remote-instance-weight-loader-seed-instance-service-port",
             type=int,
-            default=ServerArgs.tokenizer_worker_num,
-            help="The worker num of the tokenizer manager.",
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
+            help="The service port of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-send-weights-group-ports",
+            type=json_list_type,
+            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
+            help="The communication group ports for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--tokenizer-path",
+            type=str,
+            default=ServerArgs.tokenizer_path,
+            help="The path of the tokenizer.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -845,6 +964,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -1035,7 +1160,7 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
             help="The scheduling policy of the requests.",
         )
         parser.add_argument(
@@ -1209,6 +1334,21 @@ class ServerArgs:
             "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
             "otherwise all metrics appear to come from TP 0.",
         )
+        parser.add_argument(
+            "--tokenizer-metrics-custom-labels-header",
+            type=str,
+            default=ServerArgs.tokenizer_metrics_custom_labels_header,
+            help="Specify the HTTP header for passing customer labels for tokenizer metrics.",
+        )
+        parser.add_argument(
+            "--tokenizer-metrics-allowed-customer-labels",
+            type=str,
+            nargs="+",
+            default=ServerArgs.tokenizer_metrics_allowed_customer_labels,
+            help="The customer labels allowed for tokenizer metrics. The labels are specified via a dict in "
+            "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
+            "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
+        )
         parser.add_argument(
             "--bucket-time-to-first-token",
             type=float,
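Note: per the help text above, the labels travel as a JSON dict in the configured header; "x-customer-labels" is the default from the new tokenizer_metrics_custom_labels_header field. A sketch of a client request, assuming a server started with --enable-metrics and --tokenizer-metrics-allowed-customer-labels label1 label2:

    import json

    import requests

    # Only labels in the allowed list are attached to the tokenizer metrics.
    resp = requests.post(
        "http://localhost:30000/generate",
        headers={"x-customer-labels": json.dumps({"label1": "value1"})},
        json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
    )
    print(resp.status_code)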
@@ -1280,6 +1420,17 @@ class ServerArgs:
             default=None,
             help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
         )
+        parser.add_argument(
+            "--enable-trace",
+            action="store_true",
+            help="Enable opentelemetry trace",
+        )
+        parser.add_argument(
+            "--oltp-traces-endpoint",
+            type=str,
+            default="localhost:4317",
+            help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
+        )
 
         # API related
         parser.add_argument(
@@ -1364,6 +1515,18 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--load-watch-interval",
+            type=float,
+            default=ServerArgs.load_watch_interval,
+            help="The interval of load watching in seconds.",
+        )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )
 
         # Multi-node distributed serving
         parser.add_argument(
@@ -1474,7 +1637,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance", "none"],
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -1490,14 +1653,23 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
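Note: --speculative-draft-model is now accepted as an alias of --speculative-draft-model-path (argparse keeps the destination derived from the first option string), and the draft weights can be pinned with the new revision flag. A sketch through the standard CLI path, with placeholder model names:

    import argparse

    from sglang.srt.server_args import ServerArgs

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args(
        [
            "--model-path", "my-org/target-model",
            "--speculative-algorithm", "STANDALONE",
            "--speculative-draft-model", "my-org/draft-model",
            "--speculative-draft-model-revision", "main",
        ]
    )
    # The alias fills the original destination.
    assert args.speculative_draft_model_path == "my-org/draft-model"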
@@ -1534,6 +1706,13 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1561,6 +1740,7 @@ class ServerArgs:
                 "flashinfer_trtllm",
                 "flashinfer_cutlass",
                 "flashinfer_mxfp4",
+                "flashinfer_cutedsl",
             ],
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
@@ -1568,7 +1748,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["mxfp4", "bf16"],
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1661,6 +1841,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
 
+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1696,7 +1891,7 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-mem-layout",
             type=str,
-            choices=["layer_first", "page_first"],
+            choices=["layer_first", "page_first", "page_first_direct"],
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
@@ -1720,6 +1915,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend_extra_config,
             help="A dictionary in JSON string format containing extra configuration for the storage backend.",
         )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )
 
         # Double Sparsity
         parser.add_argument(
@@ -1933,6 +2134,12 @@ class ServerArgs:
             default=ServerArgs.triton_attention_num_kv_splits,
             help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
         )
+        parser.add_argument(
+            "--triton-attention-split-tile-size",
+            type=int,
+            default=ServerArgs.triton_attention_split_tile_size,
+            help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
+        )
         parser.add_argument(
             "--num-continuous-decode-steps",
             type=int,
@@ -1992,6 +2199,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )
 
         # Debug tensor dumps
         parser.add_argument(
@@ -2017,12 +2230,29 @@ class ServerArgs:
             action="store_true",
             help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
         )
+        parser.add_argument(
+            "--enable-dynamic-batch-tokenizer",
+            action="store_true",
+            help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-size",
+            type=int,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_size,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-timeout",
+            type=float,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
+        )
 
         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
             type=str,
-            default="null",
+            default=ServerArgs.disaggregation_mode,
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
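Note: the two tuning knobs only take effect together with the enable flag, and __post_init__ (earlier hunk) rejects combining it with --enable-tokenizer-batch-encode. A sketch of a launch with the defaults made explicit (the model name is a placeholder):

    import subprocess

    # Coalesce up to 32 concurrent tokenization requests, waiting at most
    # 2 ms for a batch to fill.
    subprocess.run(
        [
            "python", "-m", "sglang.launch_server",
            "--model-path", "my-org/my-model",
            "--enable-dynamic-batch-tokenizer",
            "--dynamic-batch-tokenizer-batch-size", "32",
            "--dynamic-batch-tokenizer-batch-timeout", "0.002",
        ],
        check=True,
    )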
@@ -2072,10 +2302,10 @@ class ServerArgs:
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
         parser.add_argument(
-            "--pdlb-url",
-            type=str,
-            default=None,
-            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
+            "--disaggregation-decode-polling-interval",
+            type=int,
+            default=ServerArgs.disaggregation_decode_polling_interval,
+            help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
         )
 
         # Custom weight loader
@@ -2122,6 +2352,11 @@ class ServerArgs:
             action="store_true",
             help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
+        parser.add_argument(
+            "--enable-flashinfer-cutedsl-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CuteDSL MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
@@ -2144,6 +2379,7 @@ class ServerArgs:
         args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
+
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         return cls(**{attr: getattr(args, attr) for attr in attrs})
 
@@ -2200,7 +2436,8 @@ class ServerArgs:
 
         # Check chunked prefill
         # Skip validation if chunked prefill is disabled (i.e., size <= 0).
-        if self.chunked_prefill_size > 0:
+        # Skip validation if disaggregation mode is decode.
+        if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
             assert (
                 self.chunked_prefill_size % self.page_size == 0
             ), "chunked_prefill_size must be divisible by page_size"
@@ -2612,7 +2849,9 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
@@ -2620,6 +2859,8 @@ def auto_choose_speculative_params(self: ServerArgs):
         "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
         "GptOssForCausalLM",
+        "BailingMoeForCausalLM",
+        "BailingMoeV2ForCausalLM",
     ]:
         # The default value for deepseek and gpt-oss
         return (3, 1, 4)
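Note: based on the surrounding code (not shown in this hunk), the returned triple appears to fill the unset speculative knobs as (num_steps, eagle_topk, num_draft_tokens). A sketch of that mapping under this assumption; the helper below is hypothetical:

    from sglang.srt.server_args import ServerArgs, auto_choose_speculative_params

    def fill_speculative_defaults(server_args: ServerArgs) -> None:
        # Assumed mapping: (speculative_num_steps, speculative_eagle_topk,
        # speculative_num_draft_tokens), matching the dataclass fields
        # earlier in this diff.
        steps, topk, draft_tokens = auto_choose_speculative_params(server_args)
        if server_args.speculative_num_steps is None:
            server_args.speculative_num_steps = steps
        if server_args.speculative_eagle_topk is None:
            server_args.speculative_eagle_topk = topk
        if server_args.speculative_num_draft_tokens is None:
            server_args.speculative_num_draft_tokens = draft_tokens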