sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/conversation.py +6 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +196 -51
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +18 -13
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +128 -43
- sglang/srt/disaggregation/utils.py +127 -123
- sglang/srt/entrypoints/engine.py +15 -1
- sglang/srt/entrypoints/http_server.py +13 -2
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +15 -2
- sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
- sglang/srt/layers/attention/flashattention_backend.py +53 -64
- sglang/srt/layers/attention/flashinfer_backend.py +1 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/triton_backend.py +119 -119
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +23 -5
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
- sglang/srt/layers/moe/ep_moe/layer.py +42 -32
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
- sglang/srt/layers/moe/topk.py +16 -8
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/lora/lora_manager.py +79 -34
- sglang/srt/lora/mem_pool.py +4 -5
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/io_struct.py +28 -4
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +39 -6
- sglang/srt/managers/scheduler.py +73 -17
- sglang/srt/managers/tokenizer_manager.py +29 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/model_executor/cuda_graph_runner.py +122 -55
- sglang/srt/model_executor/forward_batch_info.py +14 -5
- sglang/srt/model_executor/model_runner.py +6 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_v2.py +113 -155
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +162 -4
- sglang/srt/openai_api/protocol.py +37 -1
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +318 -233
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
- sglang/srt/speculative/eagle_utils.py +389 -109
- sglang/srt/speculative/eagle_worker.py +134 -43
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +58 -0
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -28,7 +28,6 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
-    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -91,6 +90,7 @@ class ServerArgs:
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
     gpu_id_step: int = 1
+    sleep_on_idle: bool = False

     # Logging
     log_level: str = "info"
@@ -112,14 +112,12 @@ class ServerArgs:
     file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
+    tool_call_parser: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"

-    # Expert parallelism
-    ep_size: int = 1
-
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
     nnodes: int = 1
@@ -138,6 +136,7 @@ class ServerArgs:
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
+    mm_attention_backend: Optional[str] = None

     # Speculative decoding
     speculative_algorithm: Optional[str] = None
@@ -149,6 +148,26 @@ class ServerArgs:
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None

+    # Expert parallelism
+    ep_size: int = 1
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_algorithm: str = "auto"
+    eplb_rebalance_num_iterations: int = 1000
+    eplb_rebalance_layers_per_chunk: Optional[int] = None
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "stat_approx", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -159,38 +178,24 @@ class ServerArgs:

     # Optimization/debug options
     disable_radix_cache: bool = False
+    cuda_graph_max_bs: Optional[int] = None
+    cuda_graph_bs: Optional[List[int]] = None
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_profile_cuda_graph: bool = False
     enable_nccl_nvls: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
+    disable_overlap_cg_plan: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
-    ep_num_redundant_experts: int = 0
-    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
-    init_expert_location: str = "trivial"
-    enable_eplb: bool = False
-    eplb_algorithm: str = "auto"
-    eplb_rebalance_num_iterations: int = 1000
-    eplb_rebalance_layers_per_chunk: Optional[int] = None
-    expert_distribution_recorder_mode: Optional[
-        Literal["stat", "stat_approx", "per_pass", "per_token"]
-    ] = None
-    expert_distribution_recorder_buffer_size: Optional[int] = None
-    enable_expert_distribution_metrics: bool = False
-    deepep_config: Optional[str] = None
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: Optional[int] = None
-    cuda_graph_bs: Optional[List[int]] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -201,29 +206,32 @@ class ServerArgs:
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
-    warmups: Optional[str] = None
-    moe_dense_tp_size: Optional[int] = None
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
-
+    enable_return_hidden_states: bool = False
+    warmups: Optional[str] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
+    debug_tensor_dump_prefill_only: bool = False

     # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: str = "null"
-    disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
+    disaggregation_bootstrap_port: int = 8998
+    disaggregation_decode_tp: Optional[int] = None
+    disaggregation_decode_dp: Optional[int] = None
+    disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
     pdlb_url: Optional[str] = None

     def __post_init__(self):
@@ -249,51 +257,72 @@ class ServerArgs:

         gpu_mem = get_device_memory_capacity(self.device)

-        # Set mem fraction static
+        # Set mem fraction static
         if self.mem_fraction_static is None:
-
-
-
-
-
-
-
-
-
-
+            if gpu_mem is not None:
+                # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+                # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
+
+                # We want mem_fraction_static to be as large as possible but still has enough room
+                # for activations and cuda graph buffers. We use the following heuristic to
+                # compute the needed size for activations and cuda graph buffers:
+                # - The size of the activation depends on the chunked_prefill_size and model size.
+                # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
+                # For GPUs with more memory, we use a larger chunked_prefill_size and
+                # capture more cuda graphs, so they need to reserve more memory.
+                parallel_size = self.tp_size * self.pp_size
+
+                if gpu_mem < 20 * 1024:
+                    # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 35 * 1024:
+                    # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 90 * 1024:
+                    # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
+                    reserved_mem = (9.5 + parallel_size / 2) * 1024
+                elif gpu_mem < 100 * 1024:
+                    # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
+                elif gpu_mem < 160 * 1024:
+                    # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
                 else:
-
-
-
-            if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
-                self.mem_fraction_static = 0.79
-            elif gpu_mem is not None and gpu_mem > 96 * 1024:
-                mem_fraction = self.mem_fraction_static
-                # 15 GB + additional 3GB for cuda graph
-                reserve_mem = 1024 * 18
-                # need reserve more memory for spec cuda graph
+                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                    reserved_mem = 32 * 1024
+
                 if self.speculative_algorithm is not None:
-
-
-
-
-
+                    # draft model and larger cuda graph buffers
+                    reserved_mem += 2 * 1024
+                if self.enable_dp_attention:
+                    reserved_mem += 4 * 1024
+
+                self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
             else:
-
-                self.mem_fraction_static *= 0.95
+                self.mem_fraction_static = 0.88

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
-            if gpu_mem is not None
-
-
-
-
-
+            if gpu_mem is not None:
+                if gpu_mem < 35 * 1024:  # A10, L40, 4090
+                    self.chunked_prefill_size = 2048
+                elif gpu_mem < 160 * 1024:  # H100, H200, A100, H20
+                    self.chunked_prefill_size = 8192
+                else:  # B200, MI300
+                    self.chunked_prefill_size = 16384
             else:
-                self.chunked_prefill_size =
+                self.chunked_prefill_size = 4096
         assert self.chunked_prefill_size % self.page_size == 0

+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            if gpu_mem is not None and gpu_mem < 35 * 1024:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 8
+                else:
+                    self.cuda_graph_max_bs = 80
+
         assert self.moe_dense_tp_size in {
             1,
             None,
@@ -311,15 +340,6 @@ class ServerArgs:
             )
             self.page_size = 128

-        # Set cuda graph max batch size
-        if self.cuda_graph_max_bs is None:
-            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
-            if gpu_mem is not None and gpu_mem < 25_000:
-                if self.tp_size < 4:
-                    self.cuda_graph_max_bs = 8
-                else:
-                    self.cuda_graph_max_bs = 80
-
         # Set kernel backends for hpu device
         if self.device == "hpu":
             self.attention_backend = "torch_native"
@@ -390,7 +410,7 @@ class ServerArgs:
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.info(
-
+                "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )

         if (self.enable_eplb or (self.init_expert_location is not None)) and (
@@ -398,7 +418,7 @@ class ServerArgs:
         ):
             self.ep_dispatch_algorithm = "static"
             logger.info(
-
+                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
             )

         if self.enable_expert_distribution_metrics and (
@@ -488,12 +508,27 @@ class ServerArgs:
             self.triton_attention_num_kv_splits = 16

         # PD disaggregation
-        if self.disaggregation_mode == "
-
-
-
+        if self.disaggregation_mode == "decode":
+            assert (
+                self.disaggregation_decode_tp is None
+            ), "Cannot set --disaggregation-decode-tp for the decode engine."
+            assert (
+                self.disaggregation_decode_dp is None
+            ), "Cannot set --disaggregation-decode-dp for the decode engine."
+
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+        elif self.disaggregation_mode == "prefill":
+            if self.disaggregation_decode_tp is None:
+                self.disaggregation_decode_tp = self.tp_size
+            if self.disaggregation_decode_dp is None:
+                self.disaggregation_decode_dp = self.dp_size
+
+            self.disaggregation_prefill_pp = self.pp_size
+            self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
+
+            self.disable_cuda_graph = True
+            logger.warning("Cuda graph is disabled for prefill server")

         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
@@ -503,6 +538,14 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )

+    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+        larger_tp = max(decode_tp, prefill_tp)
+        smaller_tp = min(decode_tp, prefill_tp)
+        assert larger_tp % smaller_tp == 0, (
+            "Different tp size is supported only when one tp is multiple of the other. "
+            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+        )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
@@ -519,10 +562,16 @@ class ServerArgs:
             help="The path of the tokenizer.",
         )
         parser.add_argument(
-            "--host",
+            "--host",
+            type=str,
+            default=ServerArgs.host,
+            help="The host of the HTTP server.",
        )
         parser.add_argument(
-            "--port",
+            "--port",
+            type=int,
+            default=ServerArgs.port,
+            help="The port of the HTTP server.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -677,6 +726,18 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )

         # Memory and scheduling
         parser.add_argument(
@@ -735,18 +796,6 @@ class ServerArgs:
             default=ServerArgs.page_size,
             help="The number of tokens in a page.",
         )
-        parser.add_argument(
-            "--impl",
-            type=str,
-            default=ServerArgs.impl,
-            help="Which implementation of the model to use.\n\n"
-            '* "auto" will try to use the SGLang implementation if it exists '
-            "and fall back to the Transformers implementation if no SGLang "
-            "implementation is available.\n"
-            '* "sglang" will use the SGLang model implementation.\n'
-            '* "transformers" will use the Transformers model '
-            "implementation.\n",
-        )

         # Other runtime options
         parser.add_argument(
@@ -822,6 +871,11 @@ class ServerArgs:
             default=ServerArgs.gpu_id_step,
             help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
         )
+        parser.add_argument(
+            "--sleep-on-idle",
+            action="store_true",
+            help="Reduce CPU usage when sglang is idle.",
+        )

         # Logging
         parser.add_argument(
@@ -929,6 +983,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        parser.add_argument(
+            "--tool-call-parser",
+            type=str,
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
+            default=ServerArgs.tool_call_parser,
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -949,15 +1010,6 @@ class ServerArgs:
             ],
         )

-        # Expert parallelism
-        parser.add_argument(
-            "--expert-parallel-size",
-            "--ep-size",
-            type=int,
-            default=ServerArgs.ep_size,
-            help="The expert parallelism size.",
-        )
-
         # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
@@ -1038,21 +1090,6 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
-        parser.add_argument(
-            "--enable-flashinfer-mla",
-            action=DeprecatedAction,
-            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashmla",
-            action=DeprecatedAction,
-            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
-        )
-        parser.add_argument(
-            "--flashinfer-mla-disable-ragged",
-            action="store_true",
-            help="Not using ragged prefill wrapper when running flashinfer mla",
-        )

         # Speculative decoding
         parser.add_argument(
@@ -1102,6 +1139,109 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
+
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="Enabling DeepEP MoE implementation for EP MoE.",
+        )
+        parser.add_argument(
+            "--deepep-mode",
+            type=str,
+            choices=["normal", "low_latency", "auto"],
+            default="auto",
+            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+        )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+        )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1146,6 +1286,18 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
+        )
+        parser.add_argument(
+            "--cuda-graph-bs",
+            type=int,
+            nargs="+",
+            help="Set the list of batch sizes for cuda graph.",
+        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -1156,6 +1308,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-profile-cuda-graph",
+            action="store_true",
+            help="Enable profiling of cuda graph capture.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
@@ -1186,6 +1343,11 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
+        parser.add_argument(
+            "--disable-overlap-cg-plan",
+            action="store_true",
+            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1201,11 +1363,6 @@ class ServerArgs:
             action="store_true",
             help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
         )
-        parser.add_argument(
-            "--enable-ep-moe",
-            action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
-        )
         parser.add_argument(
             "--enable-two-batch-overlap",
             action="store_true",
@@ -1222,18 +1379,6 @@ class ServerArgs:
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
-        parser.add_argument(
-            "--cuda-graph-max-bs",
-            type=int,
-            default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
-        )
-        parser.add_argument(
-            "--cuda-graph-bs",
-            type=int,
-            nargs="+",
-            help="Set the list of batch sizes for cuda graph.",
-        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -1290,13 +1435,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--tool-call-parser",
-            type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
-            default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
-        )
         parser.add_argument(
             "--enable-hierarchical-cache",
             action="store_true",
@@ -1322,86 +1460,9 @@ class ServerArgs:
             help="The write policy of hierarchical cache.",
         )
         parser.add_argument(
-            "--
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
-        parser.add_argument(
-            "--moe-dense-tp-size",
-            type=int,
-            default=ServerArgs.moe_dense_tp_size,
-            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
-        )
-        parser.add_argument(
-            "--deepep-mode",
-            type=str,
-            choices=["normal", "low_latency", "auto"],
-            default="auto",
-            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
-        )
-        parser.add_argument(
-            "--ep-num-redundant-experts",
-            type=int,
-            default=ServerArgs.ep_num_redundant_experts,
-            help="Allocate this number of redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--ep-dispatch-algorithm",
-            type=str,
-            default=ServerArgs.ep_dispatch_algorithm,
-            help="The algorithm to choose ranks for redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--init-expert-location",
-            type=str,
-            default=ServerArgs.init_expert_location,
-            help="Initial location of EP experts.",
-        )
-        parser.add_argument(
-            "--enable-eplb",
-            action="store_true",
-            help="Enable EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-algorithm",
-            type=str,
-            default=ServerArgs.eplb_algorithm,
-            help="Chosen EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-num-iterations",
-            type=int,
-            default=ServerArgs.eplb_rebalance_num_iterations,
-            help="Number of iterations to automatically trigger a EPLB re-balance.",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-layers-per-chunk",
-            type=int,
-            default=ServerArgs.eplb_rebalance_layers_per_chunk,
-            help="Number of layers to rebalance per forward pass.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-mode",
-            type=str,
-            default=ServerArgs.expert_distribution_recorder_mode,
-            help="Mode of expert distribution recorder.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-buffer-size",
-            type=int,
-            default=ServerArgs.expert_distribution_recorder_buffer_size,
-            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
-        )
-        parser.add_argument(
-            "--enable-expert-distribution-metrics",
+            "--flashinfer-mla-disable-ragged",
             action="store_true",
-            help="
-        )
-        parser.add_argument(
-            "--deepep-config",
-            type=str,
-            default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
         )
         parser.add_argument(
             "--disable-shared-experts-fusion",
@@ -1418,8 +1479,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
-
-
+        parser.add_argument(
+            "--enable-return-hidden-states",
+            action="store_true",
+            help="Enable returning hidden states with responses.",
+        )
         parser.add_argument(
             "--warmups",
             type=str,
@@ -1447,6 +1511,11 @@ class ServerArgs:
             default=ServerArgs.debug_tensor_dump_inject,
             help="Inject the outputs from jax as the input of every layer.",
         )
+        parser.add_argument(
+            "--debug-tensor-dump-prefill-only",
+            action="store_true",
+            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+        )

         # Disaggregation
         parser.add_argument(
@@ -1456,6 +1525,13 @@ class ServerArgs:
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
+        parser.add_argument(
+            "--disaggregation-transfer-backend",
+            type=str,
+            default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
+            help="The backend for disaggregation transfer. Default is mooncake.",
+        )
         parser.add_argument(
             "--disaggregation-bootstrap-port",
             type=int,
@@ -1463,11 +1539,22 @@ class ServerArgs:
             help="Bootstrap server port on the prefill server. Default is 8998.",
         )
         parser.add_argument(
-            "--disaggregation-
-            type=
-            default=ServerArgs.
-
-
+            "--disaggregation-decode-tp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_tp,
+            help="Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-decode-dp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_dp,
+            help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-prefill-pp",
+            type=int,
+            default=ServerArgs.disaggregation_prefill_pp,
+            help="Prefill pp size. If not set, it is default to 1. This is only set on the decode server.",
         )
         parser.add_argument(
             "--disaggregation-ib-device",
@@ -1477,6 +1564,12 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--num-reserved-decode-tokens",
+            type=int,
+            default=ServerArgs.num_reserved_decode_tokens,
+            help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
+        )
         parser.add_argument(
             "--pdlb-url",
             type=str,
@@ -1484,14 +1577,6 @@ class ServerArgs:
             help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
         )

-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
-
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size