sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_one_batch.py +8 -6
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +19 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +211 -72
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +30 -29
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +144 -55
- sglang/srt/disaggregation/utils.py +155 -123
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +37 -29
- sglang/srt/entrypoints/http_server.py +153 -72
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +40 -3
- sglang/srt/layers/attention/aiter_backend.py +20 -4
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
- sglang/srt/layers/attention/flashattention_backend.py +71 -72
- sglang/srt/layers/attention/flashinfer_backend.py +10 -8
- sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -12
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +138 -130
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +28 -10
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +2 -14
- sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
- sglang/srt/layers/moe/ep_moe/layer.py +249 -33
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +107 -12
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +249 -105
- sglang/srt/lora/mem_pool.py +53 -50
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -14
- sglang/srt/managers/io_struct.py +31 -10
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +79 -37
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +220 -79
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +40 -10
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -15
- sglang/srt/mem_cache/hiradix_cache.py +38 -25
- sglang/srt/mem_cache/memory_pool.py +213 -505
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +56 -28
- sglang/srt/model_executor/cuda_graph_runner.py +198 -100
- sglang/srt/model_executor/forward_batch_info.py +32 -10
- sglang/srt/model_executor/model_runner.py +28 -12
- sglang/srt/model_loader/loader.py +16 -2
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +213 -173
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +351 -238
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
- sglang/srt/speculative/eagle_utils.py +468 -116
- sglang/srt/speculative/eagle_worker.py +258 -84
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +235 -11
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +2 -0
- sglang/test/test_utils.py +4 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -1990
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -28,7 +28,6 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
-    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -91,6 +90,7 @@ class ServerArgs:
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
     gpu_id_step: int = 1
+    sleep_on_idle: bool = False

     # Logging
     log_level: str = "info"
@@ -112,14 +112,12 @@ class ServerArgs:
     file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
+    tool_call_parser: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"

-    # Expert parallelism
-    ep_size: int = 1
-
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
     nnodes: int = 1
@@ -138,6 +136,7 @@ class ServerArgs:
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
+    mm_attention_backend: Optional[str] = None

     # Speculative decoding
     speculative_algorithm: Optional[str] = None
@@ -149,6 +148,27 @@ class ServerArgs:
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None

+    # Expert parallelism
+    ep_size: int = 1
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+    enable_flashinfer_moe: bool = False
+    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_algorithm: str = "auto"
+    eplb_rebalance_num_iterations: int = 1000
+    eplb_rebalance_layers_per_chunk: Optional[int] = None
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "stat_approx", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -159,38 +179,24 @@ class ServerArgs:

     # Optimization/debug options
     disable_radix_cache: bool = False
+    cuda_graph_max_bs: Optional[int] = None
+    cuda_graph_bs: Optional[List[int]] = None
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_profile_cuda_graph: bool = False
     enable_nccl_nvls: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
+    disable_overlap_cg_plan: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
-    ep_num_redundant_experts: int = 0
-    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
-    init_expert_location: str = "trivial"
-    enable_eplb: bool = False
-    eplb_algorithm: str = "auto"
-    eplb_rebalance_num_iterations: int = 1000
-    eplb_rebalance_layers_per_chunk: Optional[int] = None
-    expert_distribution_recorder_mode: Optional[
-        Literal["stat", "stat_approx", "per_pass", "per_token"]
-    ] = None
-    expert_distribution_recorder_buffer_size: Optional[int] = None
-    enable_expert_distribution_metrics: bool = False
-    deepep_config: Optional[str] = None
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: Optional[int] = None
-    cuda_graph_bs: Optional[List[int]] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -201,31 +207,38 @@ class ServerArgs:
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
-    warmups: Optional[str] = None
-    moe_dense_tp_size: Optional[int] = None
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
-
+    enable_return_hidden_states: bool = False
+    warmups: Optional[str] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
+    debug_tensor_dump_prefill_only: bool = False

     # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: str = "null"
-    disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
+    disaggregation_bootstrap_port: int = 8998
+    disaggregation_decode_tp: Optional[int] = None
+    disaggregation_decode_dp: Optional[int] = None
+    disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
     pdlb_url: Optional[str] = None

+    # For model weight update
+    custom_weight_loader: Optional[List[str]] = None
+    weight_loader_disable_mmap: bool = False
+
     def __post_init__(self):
         # Expert parallelism
         if self.enable_ep_moe:
@@ -233,7 +246,15 @@ class ServerArgs:
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
+            )
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
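For reference, the new enable_flashinfer_moe branch can be read as a standalone guard. A minimal sketch of its behavior (illustrative only, mirroring the hunk above rather than any sglang API):

import os

def check_flashinfer_moe(quantization: str) -> None:
    # The FlashInfer CUTLASS MoE path only supports modelopt_fp4 quantization.
    assert (
        quantization == "modelopt_fp4"
    ), "modelopt_fp4 quantization is required for Flashinfer MOE"
    # Side effects in the real branch: programmatic dependent launch is enabled
    # for the TRT-LLM kernels, and shared expert fusion is force-disabled.
    os.environ["TRTLLM_ENABLE_PDL"] = "1"

check_flashinfer_moe("modelopt_fp4")  # passes; any other value raises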
@@ -249,51 +270,72 @@ class ServerArgs:

         gpu_mem = get_device_memory_capacity(self.device)

-        # Set mem fraction static
+        # Set mem fraction static
         if self.mem_fraction_static is None:
-            …
+            if gpu_mem is not None:
+                # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+                # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
+
+                # We want mem_fraction_static to be as large as possible but still has enough room
+                # for activations and cuda graph buffers. We use the following heuristic to
+                # compute the needed size for activations and cuda graph buffers:
+                # - The size of the activation depends on the chunked_prefill_size and model size.
+                # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
+                # For GPUs with more memory, we use a larger chunked_prefill_size and
+                # capture more cuda graphs, so they need to reserve more memory.
+                parallel_size = self.tp_size * self.pp_size
+
+                if gpu_mem < 20 * 1024:
+                    # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 35 * 1024:
+                    # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 90 * 1024:
+                    # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
+                    reserved_mem = (9.5 + parallel_size / 2) * 1024
+                elif gpu_mem < 100 * 1024:
+                    # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
+                elif gpu_mem < 160 * 1024:
+                    # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
                 else:
-            …
-            if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
-                self.mem_fraction_static = 0.79
-            elif gpu_mem is not None and gpu_mem > 96 * 1024:
-                mem_fraction = self.mem_fraction_static
-                # 15 GB + additional 3GB for cuda graph
-                reserve_mem = 1024 * 18
-                # need reserve more memory for spec cuda graph
+                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                    reserved_mem = 32 * 1024
+
                 if self.speculative_algorithm is not None:
-            …
+                    # draft model and larger cuda graph buffers
+                    reserved_mem += 2 * 1024
+                if self.enable_dp_attention:
+                    reserved_mem += 4 * 1024
+
+                self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
             else:
-
-                self.mem_fraction_static *= 0.95
+                self.mem_fraction_static = 0.88

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
-            if gpu_mem is not None
-            …
+            if gpu_mem is not None:
+                if gpu_mem < 35 * 1024:  # A10, L40, 4090
+                    self.chunked_prefill_size = 2048
+                elif gpu_mem < 160 * 1024:  # H100, H200, A100, H20
+                    self.chunked_prefill_size = 8192
+                else:  # B200, MI300
+                    self.chunked_prefill_size = 16384
             else:
-                self.chunked_prefill_size =
+                self.chunked_prefill_size = 4096
         assert self.chunked_prefill_size % self.page_size == 0

+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            if gpu_mem is not None and gpu_mem < 35 * 1024:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 8
+                else:
+                    self.cuda_graph_max_bs = 80
+
         assert self.moe_dense_tp_size in {
             1,
             None,
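The relocated mem_fraction_static logic is now a pure function of GPU memory and parallelism, so it can be sanity-checked outside the dataclass. A standalone sketch (values in MB, matching the 1024-based thresholds in the hunk; the speculative and dp_attention arguments mirror the two surcharges):

def estimate_mem_fraction_static(gpu_mem_mb, tp_size=1, pp_size=1,
                                 speculative=False, dp_attention=False):
    parallel_size = tp_size * pp_size
    if gpu_mem_mb < 20 * 1024:        # T4, 4080
        reserved = (2.8 + parallel_size / 10) * 1024
    elif gpu_mem_mb < 35 * 1024:      # A10, L40, 4090, 5090
        reserved = (2.8 + parallel_size / 10) * 1024
    elif gpu_mem_mb < 90 * 1024:      # H100, A100
        reserved = (9.5 + parallel_size / 2) * 1024
    elif gpu_mem_mb < 100 * 1024:     # H20
        reserved = (12 + parallel_size / 2) * 1024
    elif gpu_mem_mb < 160 * 1024:     # H200
        reserved = (12 + parallel_size / 2) * 1024
    else:                             # B200, MI300
        reserved = 32 * 1024
    if speculative:                   # draft model + larger cuda graph buffers
        reserved += 2 * 1024
    if dp_attention:
        reserved += 4 * 1024
    return round((gpu_mem_mb - reserved) / gpu_mem_mb, 3)

# e.g. a single 80 GB H100 (~81920 MB) at TP=8:
# reserved = (9.5 + 4) * 1024 = 13824 MB -> mem_fraction_static = 0.831
print(estimate_mem_fraction_static(80 * 1024, tp_size=8))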
@@ -311,15 +353,6 @@ class ServerArgs:
             )
             self.page_size = 128

-        # Set cuda graph max batch size
-        if self.cuda_graph_max_bs is None:
-            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
-            if gpu_mem is not None and gpu_mem < 25_000:
-                if self.tp_size < 4:
-                    self.cuda_graph_max_bs = 8
-                else:
-                    self.cuda_graph_max_bs = 80
-
         # Set kernel backends for hpu device
         if self.device == "hpu":
             self.attention_backend = "torch_native"
@@ -364,7 +397,6 @@ class ServerArgs:
             ), "Please enable dp attention when setting enable_dp_attention. "

         # DeepEP MoE
-        self.enable_sp_layernorm = False
         if self.enable_deepep_moe:
             if self.deepep_mode == "auto":
                 assert (
@@ -374,9 +406,6 @@ class ServerArgs:
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
             self.ep_size = self.tp_size
-            self.enable_sp_layernorm = (
-                self.dp_size < self.tp_size if self.enable_dp_attention else True
-            )
             logger.warning(
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
@@ -390,7 +419,7 @@ class ServerArgs:
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.info(
-                …
+                "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )

         if (self.enable_eplb or (self.init_expert_location is not None)) and (
@@ -398,7 +427,7 @@ class ServerArgs:
         ):
             self.ep_dispatch_algorithm = "static"
             logger.info(
-                …
+                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
             )

         if self.enable_expert_distribution_metrics and (
@@ -488,12 +517,27 @@ class ServerArgs:
             self.triton_attention_num_kv_splits = 16

         # PD disaggregation
-        if self.disaggregation_mode == "decode":
-            …
+        if self.disaggregation_mode == "decode":
+            assert (
+                self.disaggregation_decode_tp is None
+            ), "Cannot set --disaggregation-decode-tp for the decode engine."
+            assert (
+                self.disaggregation_decode_dp is None
+            ), "Cannot set --disaggregation-decode-dp for the decode engine."
+
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+        elif self.disaggregation_mode == "prefill":
+            if self.disaggregation_decode_tp is None:
+                self.disaggregation_decode_tp = self.tp_size
+            if self.disaggregation_decode_dp is None:
+                self.disaggregation_decode_dp = self.dp_size
+
+            self.disaggregation_prefill_pp = self.pp_size
+            self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
+
+            self.disable_cuda_graph = True
+            logger.warning("Cuda graph is disabled for prefill server")

         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
@@ -503,6 +547,17 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )

+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []
+
+    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+        larger_tp = max(decode_tp, prefill_tp)
+        smaller_tp = min(decode_tp, prefill_tp)
+        assert larger_tp % smaller_tp == 0, (
+            "Different tp size is supported only when one tp is multiple of the other. "
+            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+        )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
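The new validate_disagg_tp_size helper only requires that one tp size divide the other, in either direction. Copied out of the hunk above for a quick standalone check:

def validate_disagg_tp_size(prefill_tp, decode_tp):
    larger_tp = max(decode_tp, prefill_tp)
    smaller_tp = min(decode_tp, prefill_tp)
    assert larger_tp % smaller_tp == 0, (
        "Different tp size is supported only when one tp is multiple of the other. "
        f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
    )

validate_disagg_tp_size(8, 2)    # ok: 8 is a multiple of 2
validate_disagg_tp_size(2, 8)    # ok: divisibility is checked both ways
# validate_disagg_tp_size(4, 6)  # AssertionError: 6 % 4 != 0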
@@ -519,10 +574,16 @@ class ServerArgs:
             help="The path of the tokenizer.",
         )
         parser.add_argument(
-            "--host",
+            "--host",
+            type=str,
+            default=ServerArgs.host,
+            help="The host of the HTTP server.",
         )
         parser.add_argument(
-            "--port",
+            "--port",
+            type=int,
+            default=ServerArgs.port,
+            help="The port of the HTTP server.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -677,6 +738,18 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )

         # Memory and scheduling
         parser.add_argument(
@@ -735,18 +808,6 @@ class ServerArgs:
             default=ServerArgs.page_size,
             help="The number of tokens in a page.",
         )
-        parser.add_argument(
-            "--impl",
-            type=str,
-            default=ServerArgs.impl,
-            help="Which implementation of the model to use.\n\n"
-            '* "auto" will try to use the SGLang implementation if it exists '
-            "and fall back to the Transformers implementation if no SGLang "
-            "implementation is available.\n"
-            '* "sglang" will use the SGLang model implementation.\n'
-            '* "transformers" will use the Transformers model '
-            "implementation.\n",
-        )

         # Other runtime options
         parser.add_argument(
@@ -822,6 +883,11 @@ class ServerArgs:
             default=ServerArgs.gpu_id_step,
             help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
         )
+        parser.add_argument(
+            "--sleep-on-idle",
+            action="store_true",
+            help="Reduce CPU usage when sglang is idle.",
+        )

         # Logging
         parser.add_argument(
@@ -929,6 +995,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        parser.add_argument(
+            "--tool-call-parser",
+            type=str,
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
+            default=ServerArgs.tool_call_parser,
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -949,15 +1022,6 @@ class ServerArgs:
             ],
         )

-        # Expert parallelism
-        parser.add_argument(
-            "--expert-parallel-size",
-            "--ep-size",
-            type=int,
-            default=ServerArgs.ep_size,
-            help="The expert parallelism size.",
-        )
-
         # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
@@ -1038,21 +1102,6 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
-        parser.add_argument(
-            "--enable-flashinfer-mla",
-            action=DeprecatedAction,
-            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashmla",
-            action=DeprecatedAction,
-            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
-        )
-        parser.add_argument(
-            "--flashinfer-mla-disable-ragged",
-            action="store_true",
-            help="Not using ragged prefill wrapper when running flashinfer mla",
-        )

         # Speculative decoding
         parser.add_argument(
@@ -1102,6 +1151,114 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
+
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-moe",
+            action="store_true",
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="Enabling DeepEP MoE implementation for EP MoE.",
+        )
+        parser.add_argument(
+            "--deepep-mode",
+            type=str,
+            choices=["normal", "low_latency", "auto"],
+            default="auto",
+            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+        )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+        )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )

         # Double Sparsity
         parser.add_argument(
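Combined with the __post_init__ hunks earlier in this file, these relocated flags compose as in the following sketch (not executed here; it assumes a CUDA machine with eight GPUs, and the model path is only a placeholder):

from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
    tp_size=8,
    enable_deepep_moe=True,      # __post_init__ sets ep_size = tp_size
    deepep_mode="low_latency",   # deepep_mode="normal" would also disable cuda graph
    ep_num_redundant_experts=16,
    enable_eplb=True,            # per the hunks above, this auto-selects
)                                # expert_distribution_recorder_mode="stat"
assert args.ep_size == args.tp_size == 8
assert args.expert_distribution_recorder_mode == "stat"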
@@ -1146,6 +1303,18 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
+        )
+        parser.add_argument(
+            "--cuda-graph-bs",
+            type=int,
+            nargs="+",
+            help="Set the list of batch sizes for cuda graph.",
+        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -1156,6 +1325,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-profile-cuda-graph",
+            action="store_true",
+            help="Enable profiling of cuda graph capture.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
@@ -1186,6 +1360,11 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
+        parser.add_argument(
+            "--disable-overlap-cg-plan",
+            action="store_true",
+            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1201,11 +1380,6 @@ class ServerArgs:
             action="store_true",
             help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
         )
-        parser.add_argument(
-            "--enable-ep-moe",
-            action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
-        )
         parser.add_argument(
             "--enable-two-batch-overlap",
             action="store_true",
@@ -1222,18 +1396,6 @@ class ServerArgs:
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
-        parser.add_argument(
-            "--cuda-graph-max-bs",
-            type=int,
-            default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
-        )
-        parser.add_argument(
-            "--cuda-graph-bs",
-            type=int,
-            nargs="+",
-            help="Set the list of batch sizes for cuda graph.",
-        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -1290,13 +1452,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--tool-call-parser",
-            type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
-            default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
-        )
         parser.add_argument(
             "--enable-hierarchical-cache",
             action="store_true",
@@ -1322,86 +1477,9 @@ class ServerArgs:
             help="The write policy of hierarchical cache.",
         )
         parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
-        parser.add_argument(
-            "--moe-dense-tp-size",
-            type=int,
-            default=ServerArgs.moe_dense_tp_size,
-            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
-        )
-        parser.add_argument(
-            "--deepep-mode",
-            type=str,
-            choices=["normal", "low_latency", "auto"],
-            default="auto",
-            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
-        )
-        parser.add_argument(
-            "--ep-num-redundant-experts",
-            type=int,
-            default=ServerArgs.ep_num_redundant_experts,
-            help="Allocate this number of redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--ep-dispatch-algorithm",
-            type=str,
-            default=ServerArgs.ep_dispatch_algorithm,
-            help="The algorithm to choose ranks for redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--init-expert-location",
-            type=str,
-            default=ServerArgs.init_expert_location,
-            help="Initial location of EP experts.",
-        )
-        parser.add_argument(
-            "--enable-eplb",
-            action="store_true",
-            help="Enable EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-algorithm",
-            type=str,
-            default=ServerArgs.eplb_algorithm,
-            help="Chosen EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-num-iterations",
-            type=int,
-            default=ServerArgs.eplb_rebalance_num_iterations,
-            help="Number of iterations to automatically trigger a EPLB re-balance.",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-layers-per-chunk",
-            type=int,
-            default=ServerArgs.eplb_rebalance_layers_per_chunk,
-            help="Number of layers to rebalance per forward pass.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-mode",
-            type=str,
-            default=ServerArgs.expert_distribution_recorder_mode,
-            help="Mode of expert distribution recorder.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-buffer-size",
-            type=int,
-            default=ServerArgs.expert_distribution_recorder_buffer_size,
-            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
-        )
-        parser.add_argument(
-            "--enable-expert-distribution-metrics",
+            "--flashinfer-mla-disable-ragged",
             action="store_true",
-            help="Enable logging metrics for expert balancedness",
-        )
-        parser.add_argument(
-            "--deepep-config",
-            type=str,
-            default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
         )
         parser.add_argument(
             "--disable-shared-experts-fusion",
@@ -1418,8 +1496,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
-
-
+        parser.add_argument(
+            "--enable-return-hidden-states",
+            action="store_true",
+            help="Enable returning hidden states with responses.",
+        )
         parser.add_argument(
             "--warmups",
             type=str,
@@ -1447,6 +1528,11 @@ class ServerArgs:
             default=ServerArgs.debug_tensor_dump_inject,
             help="Inject the outputs from jax as the input of every layer.",
         )
+        parser.add_argument(
+            "--debug-tensor-dump-prefill-only",
+            action="store_true",
+            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+        )

         # Disaggregation
         parser.add_argument(
@@ -1456,6 +1542,13 @@ class ServerArgs:
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
+        parser.add_argument(
+            "--disaggregation-transfer-backend",
+            type=str,
+            default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
+            help="The backend for disaggregation transfer. Default is mooncake.",
+        )
         parser.add_argument(
             "--disaggregation-bootstrap-port",
             type=int,
@@ -1463,11 +1556,22 @@ class ServerArgs:
             help="Bootstrap server port on the prefill server. Default is 8998.",
         )
         parser.add_argument(
-            "--disaggregation-transfer-backend",
-            type=str,
-            default=ServerArgs.disaggregation_transfer_backend,
-            …
+            "--disaggregation-decode-tp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_tp,
+            help="Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-decode-dp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_dp,
+            help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-prefill-pp",
+            type=int,
+            default=ServerArgs.disaggregation_prefill_pp,
+            help="Prefill pp size. If not set, it is default to 1. This is only set on the decode server.",
         )
         parser.add_argument(
             "--disaggregation-ib-device",
@@ -1477,19 +1581,29 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--num-reserved-decode-tokens",
+            type=int,
+            default=ServerArgs.num_reserved_decode_tokens,
+            help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
+        )
         parser.add_argument(
             "--pdlb-url",
             type=str,
             default=None,
             help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
         )
-
         parser.add_argument(
-            "--
+            "--custom-weight-loader",
             type=str,
-
-            default=
-            help="
+            nargs="*",
+            default=None,
+            help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
+        )
+        parser.add_argument(
+            "--weight-loader-disable-mmap",
+            action="store_true",
+            help="Disable mmap while loading weight using safetensors.",
         )

     @classmethod
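The --custom-weight-loader help above expects a dotted import path such as my_package.weight_load_func. A minimal resolver for that convention (an illustration of the flag's contract, not sglang's actual loading code):

import importlib

def resolve_weight_loader(import_path: str):
    # "my_package.weight_load_func" -> import my_package, return weight_load_func
    module_name, _, func_name = import_path.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, func_name)

# loader = resolve_weight_loader("my_package.weight_load_func")  # hypothetical path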
@@ -1615,9 +1729,8 @@ class PortArgs:
         dist_init_host, dist_init_port = dist_init_addr
         port_base = int(dist_init_port) + 1
         if dp_rank is None:
-            scheduler_input_port = (
-                port_base + 3
-            )  # TokenizerManager to DataParallelController
+            # TokenizerManager to DataParallelController
+            scheduler_input_port = port_base + 3
         else:
             scheduler_input_port = port_base + 3 + 1 + dp_rank
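The PortArgs change collapses a multi-line assignment but keeps the same port layout: the scheduler input port sits 3 above port_base (itself dist_init_port + 1), shifted by a further 1 + dp_rank per data-parallel replica. A standalone sketch with a placeholder port:

def scheduler_input_port(dist_init_port: int, dp_rank=None) -> int:
    port_base = dist_init_port + 1
    if dp_rank is None:
        # TokenizerManager to DataParallelController
        return port_base + 3
    return port_base + 3 + 1 + dp_rank

assert scheduler_input_port(5000) == 5004             # port_base 5001 + 3
assert scheduler_input_port(5000, dp_rank=2) == 5007  # 5001 + 3 + 1 + 2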