sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_one_batch.py +8 -6
  4. sglang/bench_serving.py +1 -1
  5. sglang/lang/interpreter.py +40 -1
  6. sglang/lang/ir.py +27 -0
  7. sglang/math_utils.py +8 -0
  8. sglang/srt/_custom_ops.py +2 -2
  9. sglang/srt/code_completion_parser.py +2 -44
  10. sglang/srt/configs/model_config.py +6 -0
  11. sglang/srt/constants.py +3 -0
  12. sglang/srt/conversation.py +19 -3
  13. sglang/srt/custom_op.py +5 -1
  14. sglang/srt/disaggregation/base/__init__.py +1 -1
  15. sglang/srt/disaggregation/base/conn.py +25 -11
  16. sglang/srt/disaggregation/common/__init__.py +5 -1
  17. sglang/srt/disaggregation/common/utils.py +42 -0
  18. sglang/srt/disaggregation/decode.py +211 -72
  19. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  20. sglang/srt/disaggregation/fake/__init__.py +1 -1
  21. sglang/srt/disaggregation/fake/conn.py +15 -9
  22. sglang/srt/disaggregation/mini_lb.py +34 -4
  23. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  24. sglang/srt/disaggregation/mooncake/conn.py +30 -29
  25. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  26. sglang/srt/disaggregation/nixl/conn.py +17 -12
  27. sglang/srt/disaggregation/prefill.py +144 -55
  28. sglang/srt/disaggregation/utils.py +155 -123
  29. sglang/srt/distributed/parallel_state.py +12 -4
  30. sglang/srt/entrypoints/engine.py +37 -29
  31. sglang/srt/entrypoints/http_server.py +153 -72
  32. sglang/srt/entrypoints/http_server_engine.py +0 -3
  33. sglang/srt/entrypoints/openai/__init__.py +0 -0
  34. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
  35. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  36. sglang/srt/entrypoints/openai/serving_chat.py +921 -0
  37. sglang/srt/entrypoints/openai/serving_completions.py +424 -0
  38. sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
  39. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  40. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  41. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  42. sglang/srt/entrypoints/openai/utils.py +72 -0
  43. sglang/srt/eplb_simulator/__init__.py +1 -0
  44. sglang/srt/eplb_simulator/reader.py +51 -0
  45. sglang/srt/function_call/base_format_detector.py +7 -4
  46. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  47. sglang/srt/function_call/ebnf_composer.py +64 -10
  48. sglang/srt/function_call/function_call_parser.py +6 -6
  49. sglang/srt/function_call/llama32_detector.py +1 -1
  50. sglang/srt/function_call/mistral_detector.py +1 -1
  51. sglang/srt/function_call/pythonic_detector.py +1 -1
  52. sglang/srt/function_call/qwen25_detector.py +1 -1
  53. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  54. sglang/srt/layers/activation.py +40 -3
  55. sglang/srt/layers/attention/aiter_backend.py +20 -4
  56. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  57. sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
  58. sglang/srt/layers/attention/flashattention_backend.py +71 -72
  59. sglang/srt/layers/attention/flashinfer_backend.py +10 -8
  60. sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
  61. sglang/srt/layers/attention/flashmla_backend.py +7 -12
  62. sglang/srt/layers/attention/tbo_backend.py +3 -3
  63. sglang/srt/layers/attention/triton_backend.py +138 -130
  64. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  65. sglang/srt/layers/attention/vision.py +51 -24
  66. sglang/srt/layers/communicator.py +28 -10
  67. sglang/srt/layers/dp_attention.py +11 -2
  68. sglang/srt/layers/layernorm.py +29 -2
  69. sglang/srt/layers/linear.py +0 -4
  70. sglang/srt/layers/logits_processor.py +2 -14
  71. sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
  72. sglang/srt/layers/moe/ep_moe/layer.py +249 -33
  73. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
  76. sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
  77. sglang/srt/layers/moe/topk.py +107 -12
  78. sglang/srt/layers/pooler.py +56 -0
  79. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  80. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  81. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  82. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  83. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  84. sglang/srt/layers/quantization/fp8.py +25 -17
  85. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  86. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  87. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  88. sglang/srt/layers/quantization/utils.py +5 -2
  89. sglang/srt/layers/radix_attention.py +2 -3
  90. sglang/srt/layers/rotary_embedding.py +42 -2
  91. sglang/srt/layers/sampler.py +1 -1
  92. sglang/srt/lora/lora_manager.py +249 -105
  93. sglang/srt/lora/mem_pool.py +53 -50
  94. sglang/srt/lora/utils.py +1 -1
  95. sglang/srt/managers/cache_controller.py +33 -14
  96. sglang/srt/managers/io_struct.py +31 -10
  97. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  98. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  99. sglang/srt/managers/schedule_batch.py +79 -37
  100. sglang/srt/managers/schedule_policy.py +70 -56
  101. sglang/srt/managers/scheduler.py +220 -79
  102. sglang/srt/managers/template_manager.py +226 -0
  103. sglang/srt/managers/tokenizer_manager.py +40 -10
  104. sglang/srt/managers/tp_worker.py +12 -2
  105. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  106. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  107. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  108. sglang/srt/mem_cache/chunk_cache.py +11 -15
  109. sglang/srt/mem_cache/hiradix_cache.py +38 -25
  110. sglang/srt/mem_cache/memory_pool.py +213 -505
  111. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  112. sglang/srt/mem_cache/radix_cache.py +56 -28
  113. sglang/srt/model_executor/cuda_graph_runner.py +198 -100
  114. sglang/srt/model_executor/forward_batch_info.py +32 -10
  115. sglang/srt/model_executor/model_runner.py +28 -12
  116. sglang/srt/model_loader/loader.py +16 -2
  117. sglang/srt/model_loader/weight_utils.py +11 -2
  118. sglang/srt/models/bert.py +113 -13
  119. sglang/srt/models/deepseek_nextn.py +29 -27
  120. sglang/srt/models/deepseek_v2.py +213 -173
  121. sglang/srt/models/glm4.py +312 -0
  122. sglang/srt/models/internvl.py +46 -102
  123. sglang/srt/models/mimo_mtp.py +2 -18
  124. sglang/srt/models/roberta.py +117 -9
  125. sglang/srt/models/vila.py +305 -0
  126. sglang/srt/reasoning_parser.py +21 -11
  127. sglang/srt/sampling/sampling_batch_info.py +24 -0
  128. sglang/srt/sampling/sampling_params.py +2 -0
  129. sglang/srt/server_args.py +351 -238
  130. sglang/srt/speculative/build_eagle_tree.py +1 -1
  131. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
  132. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
  133. sglang/srt/speculative/eagle_utils.py +468 -116
  134. sglang/srt/speculative/eagle_worker.py +258 -84
  135. sglang/srt/torch_memory_saver_adapter.py +19 -15
  136. sglang/srt/two_batch_overlap.py +4 -2
  137. sglang/srt/utils.py +235 -11
  138. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  139. sglang/test/runners.py +38 -3
  140. sglang/test/test_block_fp8.py +1 -0
  141. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  142. sglang/test/test_block_fp8_ep.py +2 -0
  143. sglang/test/test_utils.py +4 -1
  144. sglang/utils.py +9 -0
  145. sglang/version.py +1 -1
  146. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
  147. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
  148. sglang/srt/entrypoints/verl_engine.py +0 -179
  149. sglang/srt/openai_api/adapter.py +0 -1990
  150. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
  151. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
  152. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -28,7 +28,6 @@ from sglang.srt.utils import (
  configure_ipv6,
  get_device,
  get_device_memory_capacity,
- is_cuda,
  is_flashinfer_available,
  is_hip,
  is_port_available,
@@ -91,6 +90,7 @@ class ServerArgs:
  download_dir: Optional[str] = None
  base_gpu_id: int = 0
  gpu_id_step: int = 1
+ sleep_on_idle: bool = False

  # Logging
  log_level: str = "info"
@@ -112,14 +112,12 @@ class ServerArgs:
  file_storage_path: str = "sglang_storage"
  enable_cache_report: bool = False
  reasoning_parser: Optional[str] = None
+ tool_call_parser: Optional[str] = None

  # Data parallelism
  dp_size: int = 1
  load_balance_method: str = "round_robin"

- # Expert parallelism
- ep_size: int = 1
-
  # Multi-node distributed serving
  dist_init_addr: Optional[str] = None
  nnodes: int = 1
@@ -138,6 +136,7 @@ class ServerArgs:
  attention_backend: Optional[str] = None
  sampling_backend: Optional[str] = None
  grammar_backend: Optional[str] = None
+ mm_attention_backend: Optional[str] = None

  # Speculative decoding
  speculative_algorithm: Optional[str] = None
@@ -149,6 +148,27 @@ class ServerArgs:
  speculative_accept_threshold_acc: float = 1.0
  speculative_token_map: Optional[str] = None

+ # Expert parallelism
+ ep_size: int = 1
+ enable_ep_moe: bool = False
+ enable_deepep_moe: bool = False
+ enable_flashinfer_moe: bool = False
+ deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+ ep_num_redundant_experts: int = 0
+ ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
+ init_expert_location: str = "trivial"
+ enable_eplb: bool = False
+ eplb_algorithm: str = "auto"
+ eplb_rebalance_num_iterations: int = 1000
+ eplb_rebalance_layers_per_chunk: Optional[int] = None
+ expert_distribution_recorder_mode: Optional[
+ Literal["stat", "stat_approx", "per_pass", "per_token"]
+ ] = None
+ expert_distribution_recorder_buffer_size: Optional[int] = None
+ enable_expert_distribution_metrics: bool = False
+ deepep_config: Optional[str] = None
+ moe_dense_tp_size: Optional[int] = None
+
  # Double Sparsity
  enable_double_sparsity: bool = False
  ds_channel_config_path: Optional[str] = None
@@ -159,38 +179,24 @@ class ServerArgs:

  # Optimization/debug options
  disable_radix_cache: bool = False
+ cuda_graph_max_bs: Optional[int] = None
+ cuda_graph_bs: Optional[List[int]] = None
  disable_cuda_graph: bool = False
  disable_cuda_graph_padding: bool = False
+ enable_profile_cuda_graph: bool = False
  enable_nccl_nvls: bool = False
  enable_tokenizer_batch_encode: bool = False
  disable_outlines_disk_cache: bool = False
  disable_custom_all_reduce: bool = False
  enable_mscclpp: bool = False
  disable_overlap_schedule: bool = False
+ disable_overlap_cg_plan: bool = False
  enable_mixed_chunk: bool = False
  enable_dp_attention: bool = False
  enable_dp_lm_head: bool = False
  enable_two_batch_overlap: bool = False
- enable_ep_moe: bool = False
- enable_deepep_moe: bool = False
- deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
- ep_num_redundant_experts: int = 0
- ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
- init_expert_location: str = "trivial"
- enable_eplb: bool = False
- eplb_algorithm: str = "auto"
- eplb_rebalance_num_iterations: int = 1000
- eplb_rebalance_layers_per_chunk: Optional[int] = None
- expert_distribution_recorder_mode: Optional[
- Literal["stat", "stat_approx", "per_pass", "per_token"]
- ] = None
- expert_distribution_recorder_buffer_size: Optional[int] = None
- enable_expert_distribution_metrics: bool = False
- deepep_config: Optional[str] = None
  enable_torch_compile: bool = False
  torch_compile_max_bs: int = 32
- cuda_graph_max_bs: Optional[int] = None
- cuda_graph_bs: Optional[List[int]] = None
  torchao_config: str = ""
  enable_nan_detection: bool = False
  enable_p2p_check: bool = False
@@ -201,31 +207,38 @@ class ServerArgs:
  enable_memory_saver: bool = False
  allow_auto_truncate: bool = False
  enable_custom_logit_processor: bool = False
- tool_call_parser: Optional[str] = None
  enable_hierarchical_cache: bool = False
  hicache_ratio: float = 2.0
  hicache_size: int = 0
  hicache_write_policy: str = "write_through_selective"
  flashinfer_mla_disable_ragged: bool = False
- warmups: Optional[str] = None
- moe_dense_tp_size: Optional[int] = None
  disable_shared_experts_fusion: bool = False
  disable_chunked_prefix_cache: bool = False
  disable_fast_image_processor: bool = False
- mm_attention_backend: Optional[str] = None
+ enable_return_hidden_states: bool = False
+ warmups: Optional[str] = None

  # Debug tensor dumps
  debug_tensor_dump_output_folder: Optional[str] = None
  debug_tensor_dump_input_file: Optional[str] = None
  debug_tensor_dump_inject: bool = False
+ debug_tensor_dump_prefill_only: bool = False

  # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
  disaggregation_mode: str = "null"
- disaggregation_bootstrap_port: int = 8998
  disaggregation_transfer_backend: str = "mooncake"
+ disaggregation_bootstrap_port: int = 8998
+ disaggregation_decode_tp: Optional[int] = None
+ disaggregation_decode_dp: Optional[int] = None
+ disaggregation_prefill_pp: Optional[int] = 1
  disaggregation_ib_device: Optional[str] = None
+ num_reserved_decode_tokens: int = 512 # used for decode kv cache offload in PD
  pdlb_url: Optional[str] = None

+ # For model weight update
+ custom_weight_loader: Optional[List[str]] = None
+ weight_loader_disable_mmap: bool = False
+
  def __post_init__(self):
  # Expert parallelism
  if self.enable_ep_moe:
@@ -233,7 +246,15 @@ class ServerArgs:
  logger.warning(
  f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
  )
-
+ if self.enable_flashinfer_moe:
+ assert (
+ self.quantization == "modelopt_fp4"
+ ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+ os.environ["TRTLLM_ENABLE_PDL"] = "1"
+ self.disable_shared_experts_fusion = True
+ logger.warning(
+ f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
+ )
  # Set missing default values
  if self.tokenizer_path is None:
  self.tokenizer_path = self.model_path
@@ -249,51 +270,72 @@ class ServerArgs:

  gpu_mem = get_device_memory_capacity(self.device)

- # Set mem fraction static, which depends on the tensor parallelism size
+ # Set mem fraction static
  if self.mem_fraction_static is None:
- parallel_size = self.tp_size * self.pp_size
- if gpu_mem is not None and gpu_mem <= 81920:
- if parallel_size >= 16:
- self.mem_fraction_static = 0.79
- elif parallel_size >= 8:
- self.mem_fraction_static = 0.81
- elif parallel_size >= 4:
- self.mem_fraction_static = 0.85
- elif parallel_size >= 2:
- self.mem_fraction_static = 0.87
+ if gpu_mem is not None:
+ # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+ # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
+
+ # We want mem_fraction_static to be as large as possible but still has enough room
+ # for activations and cuda graph buffers. We use the following heuristic to
+ # compute the needed size for activations and cuda graph buffers:
+ # - The size of the activation depends on the chunked_prefill_size and model size.
+ # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
+ # For GPUs with more memory, we use a larger chunked_prefill_size and
+ # capture more cuda graphs, so they need to reserve more memory.
+ parallel_size = self.tp_size * self.pp_size
+
+ if gpu_mem < 20 * 1024:
+ # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+ reserved_mem = (2.8 + parallel_size / 10) * 1024
+ elif gpu_mem < 35 * 1024:
+ # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+ reserved_mem = (2.8 + parallel_size / 10) * 1024
+ elif gpu_mem < 90 * 1024:
+ # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
+ reserved_mem = (9.5 + parallel_size / 2) * 1024
+ elif gpu_mem < 100 * 1024:
+ # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+ reserved_mem = (12 + parallel_size / 2) * 1024
+ elif gpu_mem < 160 * 1024:
+ # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+ reserved_mem = (12 + parallel_size / 2) * 1024
  else:
- self.mem_fraction_static = 0.88
- else:
- self.mem_fraction_static = 0.88
- if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
- self.mem_fraction_static = 0.79
- elif gpu_mem is not None and gpu_mem > 96 * 1024:
- mem_fraction = self.mem_fraction_static
- # 15 GB + additional 3GB for cuda graph
- reserve_mem = 1024 * 18
- # need reserve more memory for spec cuda graph
+ # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+ reserved_mem = 32 * 1024
+
  if self.speculative_algorithm is not None:
- reserve_mem = 1024 * 20
- self.mem_fraction_static = min(
- mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
- (gpu_mem - reserve_mem) / gpu_mem,
- )
+ # draft model and larger cuda graph buffers
+ reserved_mem += 2 * 1024
+ if self.enable_dp_attention:
+ reserved_mem += 4 * 1024
+
+ self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
  else:
- if self.speculative_algorithm is not None:
- self.mem_fraction_static *= 0.95
+ self.mem_fraction_static = 0.88

  # Set chunked prefill size, which depends on the gpu memory capacity
  if self.chunked_prefill_size is None:
- if gpu_mem is not None and gpu_mem > 180_000:
- self.chunked_prefill_size = 16384
- elif gpu_mem is not None and gpu_mem < 25_000:
- self.chunked_prefill_size = 2048
- elif self.disaggregation_mode != "null":
- self.chunked_prefill_size = 16384
+ if gpu_mem is not None:
+ if gpu_mem < 35 * 1024: # A10, L40, 4090
+ self.chunked_prefill_size = 2048
+ elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
+ self.chunked_prefill_size = 8192
+ else: # B200, MI300
+ self.chunked_prefill_size = 16384
  else:
- self.chunked_prefill_size = 8192
+ self.chunked_prefill_size = 4096
  assert self.chunked_prefill_size % self.page_size == 0

+ # Set cuda graph max batch size
+ if self.cuda_graph_max_bs is None:
+ # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+ if gpu_mem is not None and gpu_mem < 35 * 1024:
+ if self.tp_size < 4:
+ self.cuda_graph_max_bs = 8
+ else:
+ self.cuda_graph_max_bs = 80
+
  assert self.moe_dense_tp_size in {
  1,
  None,
@@ -311,15 +353,6 @@ class ServerArgs:
  )
  self.page_size = 128

- # Set cuda graph max batch size
- if self.cuda_graph_max_bs is None:
- # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
- if gpu_mem is not None and gpu_mem < 25_000:
- if self.tp_size < 4:
- self.cuda_graph_max_bs = 8
- else:
- self.cuda_graph_max_bs = 80
-
  # Set kernel backends for hpu device
  if self.device == "hpu":
  self.attention_backend = "torch_native"
@@ -364,7 +397,6 @@ class ServerArgs:
  ), "Please enable dp attention when setting enable_dp_attention. "

  # DeepEP MoE
- self.enable_sp_layernorm = False
  if self.enable_deepep_moe:
  if self.deepep_mode == "auto":
  assert (
@@ -374,9 +406,6 @@ class ServerArgs:
  logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
  self.disable_cuda_graph = True
  self.ep_size = self.tp_size
- self.enable_sp_layernorm = (
- self.dp_size < self.tp_size if self.enable_dp_attention else True
- )
  logger.warning(
  f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
  )
@@ -390,7 +419,7 @@ class ServerArgs:
  if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
  self.expert_distribution_recorder_mode = "stat"
  logger.info(
- f"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
+ "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
  )

  if (self.enable_eplb or (self.init_expert_location is not None)) and (
@@ -398,7 +427,7 @@ class ServerArgs:
  ):
  self.ep_dispatch_algorithm = "static"
  logger.info(
- f"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
+ "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
  )

  if self.enable_expert_distribution_metrics and (
@@ -488,12 +517,27 @@ class ServerArgs:
  self.triton_attention_num_kv_splits = 16

  # PD disaggregation
- if self.disaggregation_mode == "prefill":
- self.disable_cuda_graph = True
- logger.warning("Cuda graph is disabled for prefill server")
- elif self.disaggregation_mode == "decode":
+ if self.disaggregation_mode == "decode":
+ assert (
+ self.disaggregation_decode_tp is None
+ ), "Cannot set --disaggregation-decode-tp for the decode engine."
+ assert (
+ self.disaggregation_decode_dp is None
+ ), "Cannot set --disaggregation-decode-dp for the decode engine."
+
  self.disable_radix_cache = True
  logger.warning("KV cache is forced as chunk cache for decode server")
+ elif self.disaggregation_mode == "prefill":
+ if self.disaggregation_decode_tp is None:
+ self.disaggregation_decode_tp = self.tp_size
+ if self.disaggregation_decode_dp is None:
+ self.disaggregation_decode_dp = self.dp_size
+
+ self.disaggregation_prefill_pp = self.pp_size
+ self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
+
+ self.disable_cuda_graph = True
+ logger.warning("Cuda graph is disabled for prefill server")

  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
  "1" if self.enable_torch_compile else "0"
@@ -503,6 +547,17 @@ class ServerArgs:
  "1" if self.disable_outlines_disk_cache else "0"
  )

+ if self.custom_weight_loader is None:
+ self.custom_weight_loader = []
+
+ def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+ larger_tp = max(decode_tp, prefill_tp)
+ smaller_tp = min(decode_tp, prefill_tp)
+ assert larger_tp % smaller_tp == 0, (
+ "Different tp size is supported only when one tp is multiple of the other. "
+ f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+ )
+
  @staticmethod
  def add_cli_args(parser: argparse.ArgumentParser):
  # Model and port args
@@ -519,10 +574,16 @@ class ServerArgs:
  help="The path of the tokenizer.",
  )
  parser.add_argument(
- "--host", type=str, default=ServerArgs.host, help="The host of the server."
+ "--host",
+ type=str,
+ default=ServerArgs.host,
+ help="The host of the HTTP server.",
  )
  parser.add_argument(
- "--port", type=int, default=ServerArgs.port, help="The port of the server."
+ "--port",
+ type=int,
+ default=ServerArgs.port,
+ help="The port of the HTTP server.",
  )
  parser.add_argument(
  "--tokenizer-mode",
@@ -677,6 +738,18 @@ class ServerArgs:
  "name, a tag name, or a commit id. If unspecified, will use "
  "the default version.",
  )
+ parser.add_argument(
+ "--impl",
+ type=str,
+ default=ServerArgs.impl,
+ help="Which implementation of the model to use.\n\n"
+ '* "auto" will try to use the SGLang implementation if it exists '
+ "and fall back to the Transformers implementation if no SGLang "
+ "implementation is available.\n"
+ '* "sglang" will use the SGLang model implementation.\n'
+ '* "transformers" will use the Transformers model '
+ "implementation.\n",
+ )

  # Memory and scheduling
  parser.add_argument(
@@ -735,18 +808,6 @@ class ServerArgs:
  default=ServerArgs.page_size,
  help="The number of tokens in a page.",
  )
- parser.add_argument(
- "--impl",
- type=str,
- default=ServerArgs.impl,
- help="Which implementation of the model to use.\n\n"
- '* "auto" will try to use the SGLang implementation if it exists '
- "and fall back to the Transformers implementation if no SGLang "
- "implementation is available.\n"
- '* "sglang" will use the SGLang model implementation.\n'
- '* "transformers" will use the Transformers model '
- "implementation.\n",
- )

  # Other runtime options
  parser.add_argument(
@@ -822,6 +883,11 @@ class ServerArgs:
  default=ServerArgs.gpu_id_step,
  help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
  )
+ parser.add_argument(
+ "--sleep-on-idle",
+ action="store_true",
+ help="Reduce CPU usage when sglang is idle.",
+ )

  # Logging
  parser.add_argument(
@@ -929,6 +995,13 @@ class ServerArgs:
  default=ServerArgs.reasoning_parser,
  help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
  )
+ parser.add_argument(
+ "--tool-call-parser",
+ type=str,
+ choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
+ default=ServerArgs.tool_call_parser,
+ help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
+ )

  # Data parallelism
  parser.add_argument(
@@ -949,15 +1022,6 @@ class ServerArgs:
  ],
  )

- # Expert parallelism
- parser.add_argument(
- "--expert-parallel-size",
- "--ep-size",
- type=int,
- default=ServerArgs.ep_size,
- help="The expert parallelism size.",
- )
-
  # Multi-node distributed serving
  parser.add_argument(
  "--dist-init-addr",
@@ -1038,21 +1102,6 @@ class ServerArgs:
  default=ServerArgs.grammar_backend,
  help="Choose the backend for grammar-guided decoding.",
  )
- parser.add_argument(
- "--enable-flashinfer-mla",
- action=DeprecatedAction,
- help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
- )
- parser.add_argument(
- "--enable-flashmla",
- action=DeprecatedAction,
- help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
- )
- parser.add_argument(
- "--flashinfer-mla-disable-ragged",
- action="store_true",
- help="Not using ragged prefill wrapper when running flashinfer mla",
- )

  # Speculative decoding
  parser.add_argument(
@@ -1102,6 +1151,114 @@ class ServerArgs:
  help="The path of the draft model's small vocab table.",
  default=ServerArgs.speculative_token_map,
  )
+ parser.add_argument(
+ "--mm-attention-backend",
+ type=str,
+ choices=["sdpa", "fa3", "triton_attn"],
+ default=ServerArgs.mm_attention_backend,
+ help="Set multimodal attention backend.",
+ )
+
+ # Expert parallelism
+ parser.add_argument(
+ "--expert-parallel-size",
+ "--ep-size",
+ type=int,
+ default=ServerArgs.ep_size,
+ help="The expert parallelism size.",
+ )
+ parser.add_argument(
+ "--enable-ep-moe",
+ action="store_true",
+ help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+ )
+ parser.add_argument(
+ "--enable-flashinfer-moe",
+ action="store_true",
+ help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
+ )
+ parser.add_argument(
+ "--enable-deepep-moe",
+ action="store_true",
+ help="Enabling DeepEP MoE implementation for EP MoE.",
+ )
+ parser.add_argument(
+ "--deepep-mode",
+ type=str,
+ choices=["normal", "low_latency", "auto"],
+ default="auto",
+ help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+ )
+ parser.add_argument(
+ "--ep-num-redundant-experts",
+ type=int,
+ default=ServerArgs.ep_num_redundant_experts,
+ help="Allocate this number of redundant experts in expert parallel.",
+ )
+ parser.add_argument(
+ "--ep-dispatch-algorithm",
+ type=str,
+ default=ServerArgs.ep_dispatch_algorithm,
+ help="The algorithm to choose ranks for redundant experts in expert parallel.",
+ )
+ parser.add_argument(
+ "--init-expert-location",
+ type=str,
+ default=ServerArgs.init_expert_location,
+ help="Initial location of EP experts.",
+ )
+ parser.add_argument(
+ "--enable-eplb",
+ action="store_true",
+ help="Enable EPLB algorithm",
+ )
+ parser.add_argument(
+ "--eplb-algorithm",
+ type=str,
+ default=ServerArgs.eplb_algorithm,
+ help="Chosen EPLB algorithm",
+ )
+ parser.add_argument(
+ "--eplb-rebalance-num-iterations",
+ type=int,
+ default=ServerArgs.eplb_rebalance_num_iterations,
+ help="Number of iterations to automatically trigger a EPLB re-balance.",
+ )
+ parser.add_argument(
+ "--eplb-rebalance-layers-per-chunk",
+ type=int,
+ default=ServerArgs.eplb_rebalance_layers_per_chunk,
+ help="Number of layers to rebalance per forward pass.",
+ )
+ parser.add_argument(
+ "--expert-distribution-recorder-mode",
+ type=str,
+ default=ServerArgs.expert_distribution_recorder_mode,
+ help="Mode of expert distribution recorder.",
+ )
+ parser.add_argument(
+ "--expert-distribution-recorder-buffer-size",
+ type=int,
+ default=ServerArgs.expert_distribution_recorder_buffer_size,
+ help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+ )
+ parser.add_argument(
+ "--enable-expert-distribution-metrics",
+ action="store_true",
+ help="Enable logging metrics for expert balancedness",
+ )
+ parser.add_argument(
+ "--deepep-config",
+ type=str,
+ default=ServerArgs.deepep_config,
+ help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+ )
+ parser.add_argument(
+ "--moe-dense-tp-size",
+ type=int,
+ default=ServerArgs.moe_dense_tp_size,
+ help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+ )

  # Double Sparsity
  parser.add_argument(
@@ -1146,6 +1303,18 @@ class ServerArgs:
  action="store_true",
  help="Disable RadixAttention for prefix caching.",
  )
+ parser.add_argument(
+ "--cuda-graph-max-bs",
+ type=int,
+ default=ServerArgs.cuda_graph_max_bs,
+ help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
+ )
+ parser.add_argument(
+ "--cuda-graph-bs",
+ type=int,
+ nargs="+",
+ help="Set the list of batch sizes for cuda graph.",
+ )
  parser.add_argument(
  "--disable-cuda-graph",
  action="store_true",
@@ -1156,6 +1325,11 @@ class ServerArgs:
  action="store_true",
  help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
  )
+ parser.add_argument(
+ "--enable-profile-cuda-graph",
+ action="store_true",
+ help="Enable profiling of cuda graph capture.",
+ )
  parser.add_argument(
  "--enable-nccl-nvls",
  action="store_true",
@@ -1186,6 +1360,11 @@ class ServerArgs:
  action="store_true",
  help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
  )
+ parser.add_argument(
+ "--disable-overlap-cg-plan",
+ action="store_true",
+ help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
+ )
  parser.add_argument(
  "--enable-mixed-chunk",
  action="store_true",
@@ -1201,11 +1380,6 @@ class ServerArgs:
  action="store_true",
  help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
  )
- parser.add_argument(
- "--enable-ep-moe",
- action="store_true",
- help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
- )
  parser.add_argument(
  "--enable-two-batch-overlap",
  action="store_true",
@@ -1222,18 +1396,6 @@ class ServerArgs:
  default=ServerArgs.torch_compile_max_bs,
  help="Set the maximum batch size when using torch compile.",
  )
- parser.add_argument(
- "--cuda-graph-max-bs",
- type=int,
- default=ServerArgs.cuda_graph_max_bs,
- help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
- )
- parser.add_argument(
- "--cuda-graph-bs",
- type=int,
- nargs="+",
- help="Set the list of batch sizes for cuda graph.",
- )
  parser.add_argument(
  "--torchao-config",
  type=str,
@@ -1290,13 +1452,6 @@ class ServerArgs:
  action="store_true",
  help="Enable users to pass custom logit processors to the server (disabled by default for security)",
  )
- parser.add_argument(
- "--tool-call-parser",
- type=str,
- choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
- default=ServerArgs.tool_call_parser,
- help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
- )
  parser.add_argument(
  "--enable-hierarchical-cache",
  action="store_true",
@@ -1322,86 +1477,9 @@ class ServerArgs:
  help="The write policy of hierarchical cache.",
  )
  parser.add_argument(
- "--enable-deepep-moe",
- action="store_true",
- help="Enabling DeepEP MoE implementation for EP MoE.",
- )
- parser.add_argument(
- "--moe-dense-tp-size",
- type=int,
- default=ServerArgs.moe_dense_tp_size,
- help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
- )
- parser.add_argument(
- "--deepep-mode",
- type=str,
- choices=["normal", "low_latency", "auto"],
- default="auto",
- help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
- )
- parser.add_argument(
- "--ep-num-redundant-experts",
- type=int,
- default=ServerArgs.ep_num_redundant_experts,
- help="Allocate this number of redundant experts in expert parallel.",
- )
- parser.add_argument(
- "--ep-dispatch-algorithm",
- type=str,
- default=ServerArgs.ep_dispatch_algorithm,
- help="The algorithm to choose ranks for redundant experts in expert parallel.",
- )
- parser.add_argument(
- "--init-expert-location",
- type=str,
- default=ServerArgs.init_expert_location,
- help="Initial location of EP experts.",
- )
- parser.add_argument(
- "--enable-eplb",
- action="store_true",
- help="Enable EPLB algorithm",
- )
- parser.add_argument(
- "--eplb-algorithm",
- type=str,
- default=ServerArgs.eplb_algorithm,
- help="Chosen EPLB algorithm",
- )
- parser.add_argument(
- "--eplb-rebalance-num-iterations",
- type=int,
- default=ServerArgs.eplb_rebalance_num_iterations,
- help="Number of iterations to automatically trigger a EPLB re-balance.",
- )
- parser.add_argument(
- "--eplb-rebalance-layers-per-chunk",
- type=int,
- default=ServerArgs.eplb_rebalance_layers_per_chunk,
- help="Number of layers to rebalance per forward pass.",
- )
- parser.add_argument(
- "--expert-distribution-recorder-mode",
- type=str,
- default=ServerArgs.expert_distribution_recorder_mode,
- help="Mode of expert distribution recorder.",
- )
- parser.add_argument(
- "--expert-distribution-recorder-buffer-size",
- type=int,
- default=ServerArgs.expert_distribution_recorder_buffer_size,
- help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
- )
- parser.add_argument(
- "--enable-expert-distribution-metrics",
+ "--flashinfer-mla-disable-ragged",
  action="store_true",
- help="Enable logging metrics for expert balancedness",
- )
- parser.add_argument(
- "--deepep-config",
- type=str,
- default=ServerArgs.deepep_config,
- help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+ help="Not using ragged prefill wrapper when running flashinfer mla",
  )
  parser.add_argument(
  "--disable-shared-experts-fusion",
@@ -1418,8 +1496,11 @@ class ServerArgs:
  action="store_true",
  help="Adopt base image processor instead of fast image processor.",
  )
-
- # Server warmups
+ parser.add_argument(
+ "--enable-return-hidden-states",
+ action="store_true",
+ help="Enable returning hidden states with responses.",
+ )
  parser.add_argument(
  "--warmups",
  type=str,
@@ -1447,6 +1528,11 @@ class ServerArgs:
  default=ServerArgs.debug_tensor_dump_inject,
  help="Inject the outputs from jax as the input of every layer.",
  )
+ parser.add_argument(
+ "--debug-tensor-dump-prefill-only",
+ action="store_true",
+ help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+ )

  # Disaggregation
  parser.add_argument(
@@ -1456,6 +1542,13 @@ class ServerArgs:
  choices=["null", "prefill", "decode"],
  help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
  )
+ parser.add_argument(
+ "--disaggregation-transfer-backend",
+ type=str,
+ default=ServerArgs.disaggregation_transfer_backend,
+ choices=["mooncake", "nixl"],
+ help="The backend for disaggregation transfer. Default is mooncake.",
+ )
  parser.add_argument(
  "--disaggregation-bootstrap-port",
  type=int,
@@ -1463,11 +1556,22 @@ class ServerArgs:
  help="Bootstrap server port on the prefill server. Default is 8998.",
  )
  parser.add_argument(
- "--disaggregation-transfer-backend",
- type=str,
- default=ServerArgs.disaggregation_transfer_backend,
- choices=["mooncake", "nixl"],
- help="The backend for disaggregation transfer. Default is mooncake.",
+ "--disaggregation-decode-tp",
+ type=int,
+ default=ServerArgs.disaggregation_decode_tp,
+ help="Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.",
+ )
+ parser.add_argument(
+ "--disaggregation-decode-dp",
+ type=int,
+ default=ServerArgs.disaggregation_decode_dp,
+ help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
+ )
+ parser.add_argument(
+ "--disaggregation-prefill-pp",
+ type=int,
+ default=ServerArgs.disaggregation_prefill_pp,
+ help="Prefill pp size. If not set, it is default to 1. This is only set on the decode server.",
  )
  parser.add_argument(
  "--disaggregation-ib-device",
@@ -1477,19 +1581,29 @@ class ServerArgs:
  "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
  "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
  )
+ parser.add_argument(
+ "--num-reserved-decode-tokens",
+ type=int,
+ default=ServerArgs.num_reserved_decode_tokens,
+ help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
+ )
  parser.add_argument(
  "--pdlb-url",
  type=str,
  default=None,
  help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
  )
-
  parser.add_argument(
- "--mm-attention-backend",
+ "--custom-weight-loader",
  type=str,
- choices=["sdpa", "fa3", "triton_attn"],
- default=ServerArgs.mm_attention_backend,
- help="Set multimodal attention backend.",
+ nargs="*",
+ default=None,
+ help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
+ )
+ parser.add_argument(
+ "--weight-loader-disable-mmap",
+ action="store_true",
+ help="Disable mmap while loading weight using safetensors.",
  )

  @classmethod
@@ -1615,9 +1729,8 @@ class PortArgs:
  dist_init_host, dist_init_port = dist_init_addr
  port_base = int(dist_init_port) + 1
  if dp_rank is None:
- scheduler_input_port = (
- port_base + 3
- ) # TokenizerManager to DataParallelController
+ # TokenizerManager to DataParallelController
+ scheduler_input_port = port_base + 3
  else:
  scheduler_input_port = port_base + 3 + 1 + dp_rank
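
Note on the new defaults in __post_init__: version 0.4.8 replaces the fixed table of mem_fraction_static values with a reserved-memory heuristic (reserve room for activations and cuda graph buffers, then give the rest to weights and the KV cache pool). The sketch below is standalone and only mirrors the branches added in the diff above; the helper name resolve_memory_defaults is hypothetical, branches with identical formulas are merged, and gpu_mem is in MiB as returned by get_device_memory_capacity().

# Sketch (not part of the package): how the new ServerArgs defaults resolve for a GPU.
def resolve_memory_defaults(gpu_mem, tp_size=1, pp_size=1,
                            speculative=False, dp_attention=False):
    parallel_size = tp_size * pp_size

    # Memory reserved for activations and cuda graph buffers, by GPU class.
    if gpu_mem < 35 * 1024:            # T4, 4080 / A10, L40, 4090, 5090
        reserved_mem = (2.8 + parallel_size / 10) * 1024
    elif gpu_mem < 90 * 1024:          # H100, A100
        reserved_mem = (9.5 + parallel_size / 2) * 1024
    elif gpu_mem < 160 * 1024:         # H20, H200 (same formula in both branches)
        reserved_mem = (12 + parallel_size / 2) * 1024
    else:                              # B200, MI300
        reserved_mem = 32 * 1024
    if speculative:
        reserved_mem += 2 * 1024       # draft model + larger cuda graph buffers
    if dp_attention:
        reserved_mem += 4 * 1024

    mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)

    # Chunked prefill size follows the same GPU classes.
    if gpu_mem < 35 * 1024:
        chunked_prefill_size = 2048
    elif gpu_mem < 160 * 1024:
        chunked_prefill_size = 8192
    else:
        chunked_prefill_size = 16384

    # cuda_graph_max_bs is only lowered for small-memory GPUs.
    cuda_graph_max_bs = None
    if gpu_mem < 35 * 1024:
        cuda_graph_max_bs = 8 if tp_size < 4 else 80

    return mem_fraction_static, chunked_prefill_size, cuda_graph_max_bs

# Example: 80 GiB H100 (81920 MiB) with TP=8 -> reserved = (9.5 + 4) * 1024 = 13824 MiB,
# mem_fraction_static = round((81920 - 13824) / 81920, 3) = 0.831.
print(resolve_memory_defaults(80 * 1024, tp_size=8))  # (0.831, 8192, None)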
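
Similarly, the check added as ServerArgs.validate_disagg_tp_size only accepts prefill/decode TP combinations where one size is an integer multiple of the other. A quick standalone illustration of that rule (a free function copied from the diff, not the method itself):

# Standalone copy of the TP-compatibility check added for PD disaggregation.
def validate_disagg_tp_size(prefill_tp: int, decode_tp: int) -> None:
    larger_tp = max(decode_tp, prefill_tp)
    smaller_tp = min(decode_tp, prefill_tp)
    assert larger_tp % smaller_tp == 0, (
        "Different tp size is supported only when one tp is multiple of the other. "
        f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
    )

validate_disagg_tp_size(prefill_tp=8, decode_tp=4)  # ok: 8 is a multiple of 4
validate_disagg_tp_size(prefill_tp=8, decode_tp=6)  # raises AssertionError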