sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (99)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/lang/interpreter.py +40 -1
  5. sglang/lang/ir.py +27 -0
  6. sglang/math_utils.py +8 -0
  7. sglang/srt/configs/model_config.py +6 -0
  8. sglang/srt/conversation.py +6 -0
  9. sglang/srt/disaggregation/base/__init__.py +1 -1
  10. sglang/srt/disaggregation/base/conn.py +25 -11
  11. sglang/srt/disaggregation/common/__init__.py +5 -1
  12. sglang/srt/disaggregation/common/utils.py +42 -0
  13. sglang/srt/disaggregation/decode.py +196 -51
  14. sglang/srt/disaggregation/fake/__init__.py +1 -1
  15. sglang/srt/disaggregation/fake/conn.py +15 -9
  16. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  17. sglang/srt/disaggregation/mooncake/conn.py +18 -13
  18. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  19. sglang/srt/disaggregation/nixl/conn.py +17 -12
  20. sglang/srt/disaggregation/prefill.py +128 -43
  21. sglang/srt/disaggregation/utils.py +127 -123
  22. sglang/srt/entrypoints/engine.py +15 -1
  23. sglang/srt/entrypoints/http_server.py +13 -2
  24. sglang/srt/eplb_simulator/__init__.py +1 -0
  25. sglang/srt/eplb_simulator/reader.py +51 -0
  26. sglang/srt/layers/activation.py +19 -0
  27. sglang/srt/layers/attention/aiter_backend.py +15 -2
  28. sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
  29. sglang/srt/layers/attention/flashattention_backend.py +53 -64
  30. sglang/srt/layers/attention/flashinfer_backend.py +1 -2
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
  32. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  33. sglang/srt/layers/attention/triton_backend.py +119 -119
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  35. sglang/srt/layers/attention/vision.py +51 -24
  36. sglang/srt/layers/communicator.py +23 -5
  37. sglang/srt/layers/linear.py +0 -4
  38. sglang/srt/layers/logits_processor.py +0 -12
  39. sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
  40. sglang/srt/layers/moe/ep_moe/layer.py +42 -32
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
  43. sglang/srt/layers/moe/topk.py +16 -8
  44. sglang/srt/layers/pooler.py +56 -0
  45. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  46. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  47. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  49. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  50. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  51. sglang/srt/layers/radix_attention.py +2 -3
  52. sglang/srt/lora/lora_manager.py +79 -34
  53. sglang/srt/lora/mem_pool.py +4 -5
  54. sglang/srt/managers/cache_controller.py +2 -1
  55. sglang/srt/managers/io_struct.py +28 -4
  56. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  57. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  58. sglang/srt/managers/schedule_batch.py +39 -6
  59. sglang/srt/managers/scheduler.py +73 -17
  60. sglang/srt/managers/tokenizer_manager.py +29 -2
  61. sglang/srt/mem_cache/chunk_cache.py +1 -0
  62. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  63. sglang/srt/mem_cache/memory_pool.py +111 -407
  64. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  65. sglang/srt/mem_cache/radix_cache.py +36 -12
  66. sglang/srt/model_executor/cuda_graph_runner.py +122 -55
  67. sglang/srt/model_executor/forward_batch_info.py +14 -5
  68. sglang/srt/model_executor/model_runner.py +6 -6
  69. sglang/srt/model_loader/loader.py +8 -1
  70. sglang/srt/models/bert.py +113 -13
  71. sglang/srt/models/deepseek_v2.py +113 -155
  72. sglang/srt/models/internvl.py +46 -102
  73. sglang/srt/models/roberta.py +117 -9
  74. sglang/srt/models/vila.py +305 -0
  75. sglang/srt/openai_api/adapter.py +162 -4
  76. sglang/srt/openai_api/protocol.py +37 -1
  77. sglang/srt/sampling/sampling_batch_info.py +24 -0
  78. sglang/srt/sampling/sampling_params.py +2 -0
  79. sglang/srt/server_args.py +318 -233
  80. sglang/srt/speculative/build_eagle_tree.py +1 -1
  81. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
  82. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
  83. sglang/srt/speculative/eagle_utils.py +389 -109
  84. sglang/srt/speculative/eagle_worker.py +134 -43
  85. sglang/srt/two_batch_overlap.py +4 -2
  86. sglang/srt/utils.py +58 -0
  87. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  88. sglang/test/runners.py +38 -3
  89. sglang/test/test_block_fp8.py +1 -0
  90. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  91. sglang/test/test_block_fp8_ep.py +1 -0
  92. sglang/test/test_utils.py +3 -1
  93. sglang/utils.py +9 -0
  94. sglang/version.py +1 -1
  95. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
  96. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
  97. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -28,7 +28,6 @@ from sglang.srt.utils import (
28
28
  configure_ipv6,
29
29
  get_device,
30
30
  get_device_memory_capacity,
31
- is_cuda,
32
31
  is_flashinfer_available,
33
32
  is_hip,
34
33
  is_port_available,
@@ -91,6 +90,7 @@ class ServerArgs:
91
90
  download_dir: Optional[str] = None
92
91
  base_gpu_id: int = 0
93
92
  gpu_id_step: int = 1
93
+ sleep_on_idle: bool = False
94
94
 
95
95
  # Logging
96
96
  log_level: str = "info"
@@ -112,14 +112,12 @@ class ServerArgs:
112
112
  file_storage_path: str = "sglang_storage"
113
113
  enable_cache_report: bool = False
114
114
  reasoning_parser: Optional[str] = None
115
+ tool_call_parser: Optional[str] = None
115
116
 
116
117
  # Data parallelism
117
118
  dp_size: int = 1
118
119
  load_balance_method: str = "round_robin"
119
120
 
120
- # Expert parallelism
121
- ep_size: int = 1
122
-
123
121
  # Multi-node distributed serving
124
122
  dist_init_addr: Optional[str] = None
125
123
  nnodes: int = 1
@@ -138,6 +136,7 @@ class ServerArgs:
138
136
  attention_backend: Optional[str] = None
139
137
  sampling_backend: Optional[str] = None
140
138
  grammar_backend: Optional[str] = None
139
+ mm_attention_backend: Optional[str] = None
141
140
 
142
141
  # Speculative decoding
143
142
  speculative_algorithm: Optional[str] = None
@@ -149,6 +148,26 @@ class ServerArgs:
149
148
  speculative_accept_threshold_acc: float = 1.0
150
149
  speculative_token_map: Optional[str] = None
151
150
 
151
+ # Expert parallelism
152
+ ep_size: int = 1
153
+ enable_ep_moe: bool = False
154
+ enable_deepep_moe: bool = False
155
+ deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
156
+ ep_num_redundant_experts: int = 0
157
+ ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
158
+ init_expert_location: str = "trivial"
159
+ enable_eplb: bool = False
160
+ eplb_algorithm: str = "auto"
161
+ eplb_rebalance_num_iterations: int = 1000
162
+ eplb_rebalance_layers_per_chunk: Optional[int] = None
163
+ expert_distribution_recorder_mode: Optional[
164
+ Literal["stat", "stat_approx", "per_pass", "per_token"]
165
+ ] = None
166
+ expert_distribution_recorder_buffer_size: Optional[int] = None
167
+ enable_expert_distribution_metrics: bool = False
168
+ deepep_config: Optional[str] = None
169
+ moe_dense_tp_size: Optional[int] = None
170
+
152
171
  # Double Sparsity
153
172
  enable_double_sparsity: bool = False
154
173
  ds_channel_config_path: Optional[str] = None
@@ -159,38 +178,24 @@ class ServerArgs:
159
178
 
160
179
  # Optimization/debug options
161
180
  disable_radix_cache: bool = False
181
+ cuda_graph_max_bs: Optional[int] = None
182
+ cuda_graph_bs: Optional[List[int]] = None
162
183
  disable_cuda_graph: bool = False
163
184
  disable_cuda_graph_padding: bool = False
185
+ enable_profile_cuda_graph: bool = False
164
186
  enable_nccl_nvls: bool = False
165
187
  enable_tokenizer_batch_encode: bool = False
166
188
  disable_outlines_disk_cache: bool = False
167
189
  disable_custom_all_reduce: bool = False
168
190
  enable_mscclpp: bool = False
169
191
  disable_overlap_schedule: bool = False
192
+ disable_overlap_cg_plan: bool = False
170
193
  enable_mixed_chunk: bool = False
171
194
  enable_dp_attention: bool = False
172
195
  enable_dp_lm_head: bool = False
173
196
  enable_two_batch_overlap: bool = False
174
- enable_ep_moe: bool = False
175
- enable_deepep_moe: bool = False
176
- deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
177
- ep_num_redundant_experts: int = 0
178
- ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
179
- init_expert_location: str = "trivial"
180
- enable_eplb: bool = False
181
- eplb_algorithm: str = "auto"
182
- eplb_rebalance_num_iterations: int = 1000
183
- eplb_rebalance_layers_per_chunk: Optional[int] = None
184
- expert_distribution_recorder_mode: Optional[
185
- Literal["stat", "stat_approx", "per_pass", "per_token"]
186
- ] = None
187
- expert_distribution_recorder_buffer_size: Optional[int] = None
188
- enable_expert_distribution_metrics: bool = False
189
- deepep_config: Optional[str] = None
190
197
  enable_torch_compile: bool = False
191
198
  torch_compile_max_bs: int = 32
192
- cuda_graph_max_bs: Optional[int] = None
193
- cuda_graph_bs: Optional[List[int]] = None
194
199
  torchao_config: str = ""
195
200
  enable_nan_detection: bool = False
196
201
  enable_p2p_check: bool = False
@@ -201,29 +206,32 @@ class ServerArgs:
201
206
  enable_memory_saver: bool = False
202
207
  allow_auto_truncate: bool = False
203
208
  enable_custom_logit_processor: bool = False
204
- tool_call_parser: Optional[str] = None
205
209
  enable_hierarchical_cache: bool = False
206
210
  hicache_ratio: float = 2.0
207
211
  hicache_size: int = 0
208
212
  hicache_write_policy: str = "write_through_selective"
209
213
  flashinfer_mla_disable_ragged: bool = False
210
- warmups: Optional[str] = None
211
- moe_dense_tp_size: Optional[int] = None
212
214
  disable_shared_experts_fusion: bool = False
213
215
  disable_chunked_prefix_cache: bool = False
214
216
  disable_fast_image_processor: bool = False
215
- mm_attention_backend: Optional[str] = None
217
+ enable_return_hidden_states: bool = False
218
+ warmups: Optional[str] = None
216
219
 
217
220
  # Debug tensor dumps
218
221
  debug_tensor_dump_output_folder: Optional[str] = None
219
222
  debug_tensor_dump_input_file: Optional[str] = None
220
223
  debug_tensor_dump_inject: bool = False
224
+ debug_tensor_dump_prefill_only: bool = False
221
225
 
222
226
  # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
223
227
  disaggregation_mode: str = "null"
224
- disaggregation_bootstrap_port: int = 8998
225
228
  disaggregation_transfer_backend: str = "mooncake"
229
+ disaggregation_bootstrap_port: int = 8998
230
+ disaggregation_decode_tp: Optional[int] = None
231
+ disaggregation_decode_dp: Optional[int] = None
232
+ disaggregation_prefill_pp: Optional[int] = 1
226
233
  disaggregation_ib_device: Optional[str] = None
234
+ num_reserved_decode_tokens: int = 512 # used for decode kv cache offload in PD
227
235
  pdlb_url: Optional[str] = None
228
236
 
229
237
  def __post_init__(self):
@@ -249,51 +257,72 @@ class ServerArgs:
249
257
 
250
258
  gpu_mem = get_device_memory_capacity(self.device)
251
259
 
252
- # Set mem fraction static, which depends on the tensor parallelism size
260
+ # Set mem fraction static
253
261
  if self.mem_fraction_static is None:
254
- parallel_size = self.tp_size * self.pp_size
255
- if gpu_mem is not None and gpu_mem <= 81920:
256
- if parallel_size >= 16:
257
- self.mem_fraction_static = 0.79
258
- elif parallel_size >= 8:
259
- self.mem_fraction_static = 0.81
260
- elif parallel_size >= 4:
261
- self.mem_fraction_static = 0.85
262
- elif parallel_size >= 2:
263
- self.mem_fraction_static = 0.87
262
+ if gpu_mem is not None:
263
+ # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
264
+ # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
265
+
266
+ # We want mem_fraction_static to be as large as possible but still has enough room
267
+ # for activations and cuda graph buffers. We use the following heuristic to
268
+ # compute the needed size for activations and cuda graph buffers:
269
+ # - The size of the activation depends on the chunked_prefill_size and model size.
270
+ # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
271
+ # For GPUs with more memory, we use a larger chunked_prefill_size and
272
+ # capture more cuda graphs, so they need to reserve more memory.
273
+ parallel_size = self.tp_size * self.pp_size
274
+
275
+ if gpu_mem < 20 * 1024:
276
+ # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
277
+ reserved_mem = (2.8 + parallel_size / 10) * 1024
278
+ elif gpu_mem < 35 * 1024:
279
+ # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
280
+ reserved_mem = (2.8 + parallel_size / 10) * 1024
281
+ elif gpu_mem < 90 * 1024:
282
+ # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
283
+ reserved_mem = (9.5 + parallel_size / 2) * 1024
284
+ elif gpu_mem < 100 * 1024:
285
+ # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
286
+ reserved_mem = (12 + parallel_size / 2) * 1024
287
+ elif gpu_mem < 160 * 1024:
288
+ # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
289
+ reserved_mem = (12 + parallel_size / 2) * 1024
264
290
  else:
265
- self.mem_fraction_static = 0.88
266
- else:
267
- self.mem_fraction_static = 0.88
268
- if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
269
- self.mem_fraction_static = 0.79
270
- elif gpu_mem is not None and gpu_mem > 96 * 1024:
271
- mem_fraction = self.mem_fraction_static
272
- # 15 GB + additional 3GB for cuda graph
273
- reserve_mem = 1024 * 18
274
- # need reserve more memory for spec cuda graph
291
+ # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
292
+ reserved_mem = 32 * 1024
293
+
275
294
  if self.speculative_algorithm is not None:
276
- reserve_mem = 1024 * 20
277
- self.mem_fraction_static = min(
278
- mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
279
- (gpu_mem - reserve_mem) / gpu_mem,
280
- )
295
+ # draft model and larger cuda graph buffers
296
+ reserved_mem += 2 * 1024
297
+ if self.enable_dp_attention:
298
+ reserved_mem += 4 * 1024
299
+
300
+ self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
281
301
  else:
282
- if self.speculative_algorithm is not None:
283
- self.mem_fraction_static *= 0.95
302
+ self.mem_fraction_static = 0.88
284
303
 
285
304
  # Set chunked prefill size, which depends on the gpu memory capacity
286
305
  if self.chunked_prefill_size is None:
287
- if gpu_mem is not None and gpu_mem > 180_000:
288
- self.chunked_prefill_size = 16384
289
- elif gpu_mem is not None and gpu_mem < 25_000:
290
- self.chunked_prefill_size = 2048
291
- elif self.disaggregation_mode != "null":
292
- self.chunked_prefill_size = 16384
306
+ if gpu_mem is not None:
307
+ if gpu_mem < 35 * 1024: # A10, L40, 4090
308
+ self.chunked_prefill_size = 2048
309
+ elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
310
+ self.chunked_prefill_size = 8192
311
+ else: # B200, MI300
312
+ self.chunked_prefill_size = 16384
293
313
  else:
294
- self.chunked_prefill_size = 8192
314
+ self.chunked_prefill_size = 4096
295
315
  assert self.chunked_prefill_size % self.page_size == 0
296
316
 
317
+ # Set cuda graph max batch size
318
+ if self.cuda_graph_max_bs is None:
319
+ # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
320
+ if gpu_mem is not None and gpu_mem < 35 * 1024:
321
+ if self.tp_size < 4:
322
+ self.cuda_graph_max_bs = 8
323
+ else:
324
+ self.cuda_graph_max_bs = 80
325
+
297
326
  assert self.moe_dense_tp_size in {
298
327
  1,
299
328
  None,
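For reference, here is a worked example of the new mem_fraction_static default introduced in the hunk above. This is a minimal sketch, not the shipped code; it assumes get_device_memory_capacity() reports 81920 MiB for an 80 GB H100 and that tp_size = pp_size = 1.

    # Hypothetical worked example of the heuristic above; all values in MiB.
    gpu_mem = 80 * 1024            # assumed report for an 80 GB H100
    parallel_size = 1 * 1          # tp_size * pp_size

    # 35 GiB <= gpu_mem < 90 GiB falls into the H100/A100 branch.
    reserved_mem = (9.5 + parallel_size / 2) * 1024   # 10240 MiB for activations + cuda graph buffers

    mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
    assert mem_fraction_static == 0.875

    # The same capacity tier drives the other new defaults in this hunk:
    # chunked_prefill_size = 8192 (gpu_mem < 160 GiB), and cuda_graph_max_bs
    # stays None because the gpu_mem < 35 GiB override (8 or 80) does not apply.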
@@ -311,15 +340,6 @@ class ServerArgs:
311
340
  )
312
341
  self.page_size = 128
313
342
 
314
- # Set cuda graph max batch size
315
- if self.cuda_graph_max_bs is None:
316
- # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
317
- if gpu_mem is not None and gpu_mem < 25_000:
318
- if self.tp_size < 4:
319
- self.cuda_graph_max_bs = 8
320
- else:
321
- self.cuda_graph_max_bs = 80
322
-
323
343
  # Set kernel backends for hpu device
324
344
  if self.device == "hpu":
325
345
  self.attention_backend = "torch_native"
@@ -390,7 +410,7 @@ class ServerArgs:
390
410
  if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
391
411
  self.expert_distribution_recorder_mode = "stat"
392
412
  logger.info(
393
- f"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
413
+ "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
394
414
  )
395
415
 
396
416
  if (self.enable_eplb or (self.init_expert_location is not None)) and (
@@ -398,7 +418,7 @@ class ServerArgs:
398
418
  ):
399
419
  self.ep_dispatch_algorithm = "static"
400
420
  logger.info(
401
- f"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
421
+ "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
402
422
  )
403
423
 
404
424
  if self.enable_expert_distribution_metrics and (
@@ -488,12 +508,27 @@ class ServerArgs:
488
508
  self.triton_attention_num_kv_splits = 16
489
509
 
490
510
  # PD disaggregation
491
- if self.disaggregation_mode == "prefill":
492
- self.disable_cuda_graph = True
493
- logger.warning("Cuda graph is disabled for prefill server")
494
- elif self.disaggregation_mode == "decode":
511
+ if self.disaggregation_mode == "decode":
512
+ assert (
513
+ self.disaggregation_decode_tp is None
514
+ ), "Cannot set --disaggregation-decode-tp for the decode engine."
515
+ assert (
516
+ self.disaggregation_decode_dp is None
517
+ ), "Cannot set --disaggregation-decode-dp for the decode engine."
518
+
495
519
  self.disable_radix_cache = True
496
520
  logger.warning("KV cache is forced as chunk cache for decode server")
521
+ elif self.disaggregation_mode == "prefill":
522
+ if self.disaggregation_decode_tp is None:
523
+ self.disaggregation_decode_tp = self.tp_size
524
+ if self.disaggregation_decode_dp is None:
525
+ self.disaggregation_decode_dp = self.dp_size
526
+
527
+ self.disaggregation_prefill_pp = self.pp_size
528
+ self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
529
+
530
+ self.disable_cuda_graph = True
531
+ logger.warning("Cuda graph is disabled for prefill server")
497
532
 
498
533
  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
499
534
  "1" if self.enable_torch_compile else "0"
@@ -503,6 +538,14 @@ class ServerArgs:
503
538
  "1" if self.disable_outlines_disk_cache else "0"
504
539
  )
505
540
 
541
+ def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
542
+ larger_tp = max(decode_tp, prefill_tp)
543
+ smaller_tp = min(decode_tp, prefill_tp)
544
+ assert larger_tp % smaller_tp == 0, (
545
+ "Different tp size is supported only when one tp is multiple of the other. "
546
+ f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
547
+ )
548
+
506
549
  @staticmethod
507
550
  def add_cli_args(parser: argparse.ArgumentParser):
508
551
  # Model and port args
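The new validate_disagg_tp_size helper only accepts prefill/decode TP combinations where one size divides the other. Below is a minimal standalone illustration of the same check; the function name is hypothetical, and the shipped code enforces it with an assert on the ServerArgs instance.

    def is_valid_disagg_tp(prefill_tp: int, decode_tp: int) -> bool:
        # Mirrors ServerArgs.validate_disagg_tp_size: one TP size must be a
        # multiple of the other.
        larger = max(prefill_tp, decode_tp)
        smaller = min(prefill_tp, decode_tp)
        return larger % smaller == 0

    assert is_valid_disagg_tp(4, 8)       # ok: decode TP is a multiple of prefill TP
    assert is_valid_disagg_tp(8, 4)       # ok: the check is symmetric
    assert not is_valid_disagg_tp(4, 6)   # rejected: neither size divides the other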
@@ -519,10 +562,16 @@ class ServerArgs:
519
562
  help="The path of the tokenizer.",
520
563
  )
521
564
  parser.add_argument(
522
- "--host", type=str, default=ServerArgs.host, help="The host of the server."
565
+ "--host",
566
+ type=str,
567
+ default=ServerArgs.host,
568
+ help="The host of the HTTP server.",
523
569
  )
524
570
  parser.add_argument(
525
- "--port", type=int, default=ServerArgs.port, help="The port of the server."
571
+ "--port",
572
+ type=int,
573
+ default=ServerArgs.port,
574
+ help="The port of the HTTP server.",
526
575
  )
527
576
  parser.add_argument(
528
577
  "--tokenizer-mode",
@@ -677,6 +726,18 @@ class ServerArgs:
677
726
  "name, a tag name, or a commit id. If unspecified, will use "
678
727
  "the default version.",
679
728
  )
729
+ parser.add_argument(
730
+ "--impl",
731
+ type=str,
732
+ default=ServerArgs.impl,
733
+ help="Which implementation of the model to use.\n\n"
734
+ '* "auto" will try to use the SGLang implementation if it exists '
735
+ "and fall back to the Transformers implementation if no SGLang "
736
+ "implementation is available.\n"
737
+ '* "sglang" will use the SGLang model implementation.\n'
738
+ '* "transformers" will use the Transformers model '
739
+ "implementation.\n",
740
+ )
680
741
 
681
742
  # Memory and scheduling
682
743
  parser.add_argument(
@@ -735,18 +796,6 @@ class ServerArgs:
735
796
  default=ServerArgs.page_size,
736
797
  help="The number of tokens in a page.",
737
798
  )
738
- parser.add_argument(
739
- "--impl",
740
- type=str,
741
- default=ServerArgs.impl,
742
- help="Which implementation of the model to use.\n\n"
743
- '* "auto" will try to use the SGLang implementation if it exists '
744
- "and fall back to the Transformers implementation if no SGLang "
745
- "implementation is available.\n"
746
- '* "sglang" will use the SGLang model implementation.\n'
747
- '* "transformers" will use the Transformers model '
748
- "implementation.\n",
749
- )
750
799
 
751
800
  # Other runtime options
752
801
  parser.add_argument(
@@ -822,6 +871,11 @@ class ServerArgs:
822
871
  default=ServerArgs.gpu_id_step,
823
872
  help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
824
873
  )
874
+ parser.add_argument(
875
+ "--sleep-on-idle",
876
+ action="store_true",
877
+ help="Reduce CPU usage when sglang is idle.",
878
+ )
825
879
 
826
880
  # Logging
827
881
  parser.add_argument(
@@ -929,6 +983,13 @@ class ServerArgs:
929
983
  default=ServerArgs.reasoning_parser,
930
984
  help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
931
985
  )
986
+ parser.add_argument(
987
+ "--tool-call-parser",
988
+ type=str,
989
+ choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
990
+ default=ServerArgs.tool_call_parser,
991
+ help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
992
+ )
932
993
 
933
994
  # Data parallelism
934
995
  parser.add_argument(
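The --tool-call-parser definition now lives here, next to --reasoning-parser; its old copy under the optimization/debug flags is removed further below in this diff. A small illustrative sketch of validating a value against the documented choices follows (the helper is hypothetical; at the CLI, argparse's choices= performs this check):

    SUPPORTED_TOOL_CALL_PARSERS = {"qwen25", "mistral", "llama3", "deepseekv3", "pythonic"}

    def check_tool_call_parser(name):
        # Illustrative helper mirroring the `choices` list of --tool-call-parser.
        if name is not None and name not in SUPPORTED_TOOL_CALL_PARSERS:
            raise ValueError(f"Unsupported tool call parser: {name!r}")

    check_tool_call_parser("llama3")   # ok
    check_tool_call_parser(None)       # ok: tool-call parsing disabled by default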
@@ -949,15 +1010,6 @@ class ServerArgs:
949
1010
  ],
950
1011
  )
951
1012
 
952
- # Expert parallelism
953
- parser.add_argument(
954
- "--expert-parallel-size",
955
- "--ep-size",
956
- type=int,
957
- default=ServerArgs.ep_size,
958
- help="The expert parallelism size.",
959
- )
960
-
961
1013
  # Multi-node distributed serving
962
1014
  parser.add_argument(
963
1015
  "--dist-init-addr",
@@ -1038,21 +1090,6 @@ class ServerArgs:
1038
1090
  default=ServerArgs.grammar_backend,
1039
1091
  help="Choose the backend for grammar-guided decoding.",
1040
1092
  )
1041
- parser.add_argument(
1042
- "--enable-flashinfer-mla",
1043
- action=DeprecatedAction,
1044
- help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
1045
- )
1046
- parser.add_argument(
1047
- "--enable-flashmla",
1048
- action=DeprecatedAction,
1049
- help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
1050
- )
1051
- parser.add_argument(
1052
- "--flashinfer-mla-disable-ragged",
1053
- action="store_true",
1054
- help="Not using ragged prefill wrapper when running flashinfer mla",
1055
- )
1056
1093
 
1057
1094
  # Speculative decoding
1058
1095
  parser.add_argument(
@@ -1102,6 +1139,109 @@ class ServerArgs:
1102
1139
  help="The path of the draft model's small vocab table.",
1103
1140
  default=ServerArgs.speculative_token_map,
1104
1141
  )
1142
+ parser.add_argument(
1143
+ "--mm-attention-backend",
1144
+ type=str,
1145
+ choices=["sdpa", "fa3", "triton_attn"],
1146
+ default=ServerArgs.mm_attention_backend,
1147
+ help="Set multimodal attention backend.",
1148
+ )
1149
+
1150
+ # Expert parallelism
1151
+ parser.add_argument(
1152
+ "--expert-parallel-size",
1153
+ "--ep-size",
1154
+ type=int,
1155
+ default=ServerArgs.ep_size,
1156
+ help="The expert parallelism size.",
1157
+ )
1158
+ parser.add_argument(
1159
+ "--enable-ep-moe",
1160
+ action="store_true",
1161
+ help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
1162
+ )
1163
+ parser.add_argument(
1164
+ "--enable-deepep-moe",
1165
+ action="store_true",
1166
+ help="Enabling DeepEP MoE implementation for EP MoE.",
1167
+ )
1168
+ parser.add_argument(
1169
+ "--deepep-mode",
1170
+ type=str,
1171
+ choices=["normal", "low_latency", "auto"],
1172
+ default="auto",
1173
+ help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
1174
+ )
1175
+ parser.add_argument(
1176
+ "--ep-num-redundant-experts",
1177
+ type=int,
1178
+ default=ServerArgs.ep_num_redundant_experts,
1179
+ help="Allocate this number of redundant experts in expert parallel.",
1180
+ )
1181
+ parser.add_argument(
1182
+ "--ep-dispatch-algorithm",
1183
+ type=str,
1184
+ default=ServerArgs.ep_dispatch_algorithm,
1185
+ help="The algorithm to choose ranks for redundant experts in expert parallel.",
1186
+ )
1187
+ parser.add_argument(
1188
+ "--init-expert-location",
1189
+ type=str,
1190
+ default=ServerArgs.init_expert_location,
1191
+ help="Initial location of EP experts.",
1192
+ )
1193
+ parser.add_argument(
1194
+ "--enable-eplb",
1195
+ action="store_true",
1196
+ help="Enable EPLB algorithm",
1197
+ )
1198
+ parser.add_argument(
1199
+ "--eplb-algorithm",
1200
+ type=str,
1201
+ default=ServerArgs.eplb_algorithm,
1202
+ help="Chosen EPLB algorithm",
1203
+ )
1204
+ parser.add_argument(
1205
+ "--eplb-rebalance-num-iterations",
1206
+ type=int,
1207
+ default=ServerArgs.eplb_rebalance_num_iterations,
1208
+ help="Number of iterations to automatically trigger a EPLB re-balance.",
1209
+ )
1210
+ parser.add_argument(
1211
+ "--eplb-rebalance-layers-per-chunk",
1212
+ type=int,
1213
+ default=ServerArgs.eplb_rebalance_layers_per_chunk,
1214
+ help="Number of layers to rebalance per forward pass.",
1215
+ )
1216
+ parser.add_argument(
1217
+ "--expert-distribution-recorder-mode",
1218
+ type=str,
1219
+ default=ServerArgs.expert_distribution_recorder_mode,
1220
+ help="Mode of expert distribution recorder.",
1221
+ )
1222
+ parser.add_argument(
1223
+ "--expert-distribution-recorder-buffer-size",
1224
+ type=int,
1225
+ default=ServerArgs.expert_distribution_recorder_buffer_size,
1226
+ help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
1227
+ )
1228
+ parser.add_argument(
1229
+ "--enable-expert-distribution-metrics",
1230
+ action="store_true",
1231
+ help="Enable logging metrics for expert balancedness",
1232
+ )
1233
+ parser.add_argument(
1234
+ "--deepep-config",
1235
+ type=str,
1236
+ default=ServerArgs.deepep_config,
1237
+ help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
1238
+ )
1239
+ parser.add_argument(
1240
+ "--moe-dense-tp-size",
1241
+ type=int,
1242
+ default=ServerArgs.moe_dense_tp_size,
1243
+ help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
1244
+ )
1105
1245
 
1106
1246
  # Double Sparsity
1107
1247
  parser.add_argument(
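Among the relocated expert-parallelism flags above, --deepep-mode documents an "auto" behavior that picks a mode per batch type. The sketch below illustrates only that documented selection rule, not the actual dispatcher implementation:

    def resolve_deepep_mode(deepep_mode, is_decode_batch):
        # Per the --deepep-mode help string, "auto" means low_latency for decode
        # batches and normal for prefill batches; explicit values are used as given.
        if deepep_mode != "auto":
            return deepep_mode
        return "low_latency" if is_decode_batch else "normal"

    assert resolve_deepep_mode("auto", is_decode_batch=True) == "low_latency"
    assert resolve_deepep_mode("auto", is_decode_batch=False) == "normal"
    assert resolve_deepep_mode("normal", is_decode_batch=True) == "normal"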
@@ -1146,6 +1286,18 @@ class ServerArgs:
1146
1286
  action="store_true",
1147
1287
  help="Disable RadixAttention for prefix caching.",
1148
1288
  )
1289
+ parser.add_argument(
1290
+ "--cuda-graph-max-bs",
1291
+ type=int,
1292
+ default=ServerArgs.cuda_graph_max_bs,
1293
+ help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
1294
+ )
1295
+ parser.add_argument(
1296
+ "--cuda-graph-bs",
1297
+ type=int,
1298
+ nargs="+",
1299
+ help="Set the list of batch sizes for cuda graph.",
1300
+ )
1149
1301
  parser.add_argument(
1150
1302
  "--disable-cuda-graph",
1151
1303
  action="store_true",
@@ -1156,6 +1308,11 @@ class ServerArgs:
1156
1308
  action="store_true",
1157
1309
  help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
1158
1310
  )
1311
+ parser.add_argument(
1312
+ "--enable-profile-cuda-graph",
1313
+ action="store_true",
1314
+ help="Enable profiling of cuda graph capture.",
1315
+ )
1159
1316
  parser.add_argument(
1160
1317
  "--enable-nccl-nvls",
1161
1318
  action="store_true",
@@ -1186,6 +1343,11 @@ class ServerArgs:
1186
1343
  action="store_true",
1187
1344
  help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
1188
1345
  )
1346
+ parser.add_argument(
1347
+ "--disable-overlap-cg-plan",
1348
+ action="store_true",
1349
+ help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
1350
+ )
1189
1351
  parser.add_argument(
1190
1352
  "--enable-mixed-chunk",
1191
1353
  action="store_true",
@@ -1201,11 +1363,6 @@ class ServerArgs:
1201
1363
  action="store_true",
1202
1364
  help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
1203
1365
  )
1204
- parser.add_argument(
1205
- "--enable-ep-moe",
1206
- action="store_true",
1207
- help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
1208
- )
1209
1366
  parser.add_argument(
1210
1367
  "--enable-two-batch-overlap",
1211
1368
  action="store_true",
@@ -1222,18 +1379,6 @@ class ServerArgs:
1222
1379
  default=ServerArgs.torch_compile_max_bs,
1223
1380
  help="Set the maximum batch size when using torch compile.",
1224
1381
  )
1225
- parser.add_argument(
1226
- "--cuda-graph-max-bs",
1227
- type=int,
1228
- default=ServerArgs.cuda_graph_max_bs,
1229
- help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
1230
- )
1231
- parser.add_argument(
1232
- "--cuda-graph-bs",
1233
- type=int,
1234
- nargs="+",
1235
- help="Set the list of batch sizes for cuda graph.",
1236
- )
1237
1382
  parser.add_argument(
1238
1383
  "--torchao-config",
1239
1384
  type=str,
@@ -1290,13 +1435,6 @@ class ServerArgs:
1290
1435
  action="store_true",
1291
1436
  help="Enable users to pass custom logit processors to the server (disabled by default for security)",
1292
1437
  )
1293
- parser.add_argument(
1294
- "--tool-call-parser",
1295
- type=str,
1296
- choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
1297
- default=ServerArgs.tool_call_parser,
1298
- help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
1299
- )
1300
1438
  parser.add_argument(
1301
1439
  "--enable-hierarchical-cache",
1302
1440
  action="store_true",
@@ -1322,86 +1460,9 @@ class ServerArgs:
1322
1460
  help="The write policy of hierarchical cache.",
1323
1461
  )
1324
1462
  parser.add_argument(
1325
- "--enable-deepep-moe",
1326
- action="store_true",
1327
- help="Enabling DeepEP MoE implementation for EP MoE.",
1328
- )
1329
- parser.add_argument(
1330
- "--moe-dense-tp-size",
1331
- type=int,
1332
- default=ServerArgs.moe_dense_tp_size,
1333
- help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
1334
- )
1335
- parser.add_argument(
1336
- "--deepep-mode",
1337
- type=str,
1338
- choices=["normal", "low_latency", "auto"],
1339
- default="auto",
1340
- help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
1341
- )
1342
- parser.add_argument(
1343
- "--ep-num-redundant-experts",
1344
- type=int,
1345
- default=ServerArgs.ep_num_redundant_experts,
1346
- help="Allocate this number of redundant experts in expert parallel.",
1347
- )
1348
- parser.add_argument(
1349
- "--ep-dispatch-algorithm",
1350
- type=str,
1351
- default=ServerArgs.ep_dispatch_algorithm,
1352
- help="The algorithm to choose ranks for redundant experts in expert parallel.",
1353
- )
1354
- parser.add_argument(
1355
- "--init-expert-location",
1356
- type=str,
1357
- default=ServerArgs.init_expert_location,
1358
- help="Initial location of EP experts.",
1359
- )
1360
- parser.add_argument(
1361
- "--enable-eplb",
1362
- action="store_true",
1363
- help="Enable EPLB algorithm",
1364
- )
1365
- parser.add_argument(
1366
- "--eplb-algorithm",
1367
- type=str,
1368
- default=ServerArgs.eplb_algorithm,
1369
- help="Chosen EPLB algorithm",
1370
- )
1371
- parser.add_argument(
1372
- "--eplb-rebalance-num-iterations",
1373
- type=int,
1374
- default=ServerArgs.eplb_rebalance_num_iterations,
1375
- help="Number of iterations to automatically trigger a EPLB re-balance.",
1376
- )
1377
- parser.add_argument(
1378
- "--eplb-rebalance-layers-per-chunk",
1379
- type=int,
1380
- default=ServerArgs.eplb_rebalance_layers_per_chunk,
1381
- help="Number of layers to rebalance per forward pass.",
1382
- )
1383
- parser.add_argument(
1384
- "--expert-distribution-recorder-mode",
1385
- type=str,
1386
- default=ServerArgs.expert_distribution_recorder_mode,
1387
- help="Mode of expert distribution recorder.",
1388
- )
1389
- parser.add_argument(
1390
- "--expert-distribution-recorder-buffer-size",
1391
- type=int,
1392
- default=ServerArgs.expert_distribution_recorder_buffer_size,
1393
- help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
1394
- )
1395
- parser.add_argument(
1396
- "--enable-expert-distribution-metrics",
1463
+ "--flashinfer-mla-disable-ragged",
1397
1464
  action="store_true",
1398
- help="Enable logging metrics for expert balancedness",
1399
- )
1400
- parser.add_argument(
1401
- "--deepep-config",
1402
- type=str,
1403
- default=ServerArgs.deepep_config,
1404
- help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
1465
+ help="Not using ragged prefill wrapper when running flashinfer mla",
1405
1466
  )
1406
1467
  parser.add_argument(
1407
1468
  "--disable-shared-experts-fusion",
@@ -1418,8 +1479,11 @@ class ServerArgs:
1418
1479
  action="store_true",
1419
1480
  help="Adopt base image processor instead of fast image processor.",
1420
1481
  )
1421
-
1422
- # Server warmups
1482
+ parser.add_argument(
1483
+ "--enable-return-hidden-states",
1484
+ action="store_true",
1485
+ help="Enable returning hidden states with responses.",
1486
+ )
1423
1487
  parser.add_argument(
1424
1488
  "--warmups",
1425
1489
  type=str,
@@ -1447,6 +1511,11 @@ class ServerArgs:
1447
1511
  default=ServerArgs.debug_tensor_dump_inject,
1448
1512
  help="Inject the outputs from jax as the input of every layer.",
1449
1513
  )
1514
+ parser.add_argument(
1515
+ "--debug-tensor-dump-prefill-only",
1516
+ action="store_true",
1517
+ help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
1518
+ )
1450
1519
 
1451
1520
  # Disaggregation
1452
1521
  parser.add_argument(
@@ -1456,6 +1525,13 @@ class ServerArgs:
1456
1525
  choices=["null", "prefill", "decode"],
1457
1526
  help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
1458
1527
  )
1528
+ parser.add_argument(
1529
+ "--disaggregation-transfer-backend",
1530
+ type=str,
1531
+ default=ServerArgs.disaggregation_transfer_backend,
1532
+ choices=["mooncake", "nixl"],
1533
+ help="The backend for disaggregation transfer. Default is mooncake.",
1534
+ )
1459
1535
  parser.add_argument(
1460
1536
  "--disaggregation-bootstrap-port",
1461
1537
  type=int,
@@ -1463,11 +1539,22 @@ class ServerArgs:
1463
1539
  help="Bootstrap server port on the prefill server. Default is 8998.",
1464
1540
  )
1465
1541
  parser.add_argument(
1466
- "--disaggregation-transfer-backend",
1467
- type=str,
1468
- default=ServerArgs.disaggregation_transfer_backend,
1469
- choices=["mooncake", "nixl"],
1470
- help="The backend for disaggregation transfer. Default is mooncake.",
1542
+ "--disaggregation-decode-tp",
1543
+ type=int,
1544
+ default=ServerArgs.disaggregation_decode_tp,
1545
+ help="Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.",
1546
+ )
1547
+ parser.add_argument(
1548
+ "--disaggregation-decode-dp",
1549
+ type=int,
1550
+ default=ServerArgs.disaggregation_decode_dp,
1551
+ help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
1552
+ )
1553
+ parser.add_argument(
1554
+ "--disaggregation-prefill-pp",
1555
+ type=int,
1556
+ default=ServerArgs.disaggregation_prefill_pp,
1557
+ help="Prefill pp size. If not set, it is default to 1. This is only set on the decode server.",
1471
1558
  )
1472
1559
  parser.add_argument(
1473
1560
  "--disaggregation-ib-device",
@@ -1477,6 +1564,12 @@ class ServerArgs:
1477
1564
  "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
1478
1565
  "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
1479
1566
  )
1567
+ parser.add_argument(
1568
+ "--num-reserved-decode-tokens",
1569
+ type=int,
1570
+ default=ServerArgs.num_reserved_decode_tokens,
1571
+ help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
1572
+ )
1480
1573
  parser.add_argument(
1481
1574
  "--pdlb-url",
1482
1575
  type=str,
@@ -1484,14 +1577,6 @@ class ServerArgs:
1484
1577
  help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
1485
1578
  )
1486
1579
 
1487
- parser.add_argument(
1488
- "--mm-attention-backend",
1489
- type=str,
1490
- choices=["sdpa", "fa3", "triton_attn"],
1491
- default=ServerArgs.mm_attention_backend,
1492
- help="Set multimodal attention backend.",
1493
- )
1494
-
1495
1580
  @classmethod
1496
1581
  def from_cli_args(cls, args: argparse.Namespace):
1497
1582
  args.tp_size = args.tensor_parallel_size