sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -1
- sglang/eval/loogle_eval.py +7 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/deepseekvl2.py +11 -2
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +10 -8
- sglang/srt/configs/update_config.py +3 -1
- sglang/srt/conversation.py +2 -1
- sglang/srt/custom_op.py +5 -2
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode.py +9 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +93 -76
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +103 -15
- sglang/srt/entrypoints/engine.py +31 -33
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +48 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -1
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +95 -63
- sglang/srt/function_call/function_call_parser.py +4 -2
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/qwen3_coder_detector.py +151 -0
- sglang/srt/hf_transformers_utils.py +0 -1
- sglang/srt/layers/activation.py +24 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/flashattention_backend.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/linear.py +13 -102
- sglang/srt/layers/logits_processor.py +34 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- sglang/srt/layers/moe/fused_moe_native.py +7 -47
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- sglang/srt/layers/moe/topk.py +190 -23
- sglang/srt/layers/quantization/__init__.py +20 -134
- sglang/srt/layers/quantization/awq.py +578 -11
- sglang/srt/layers/quantization/awq_triton.py +339 -0
- sglang/srt/layers/quantization/base_config.py +85 -10
- sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
- sglang/srt/layers/quantization/fp8.py +273 -62
- sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +501 -143
- sglang/srt/layers/quantization/marlin_utils.py +790 -0
- sglang/srt/layers/quantization/modelopt_quant.py +34 -112
- sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang/srt/layers/quantization/petit.py +252 -0
- sglang/srt/layers/quantization/petit_utils.py +104 -0
- sglang/srt/layers/quantization/qoq.py +7 -6
- sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang/srt/layers/quantization/unquant.py +422 -0
- sglang/srt/layers/quantization/utils.py +340 -9
- sglang/srt/layers/quantization/w4afp8.py +8 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- sglang/srt/lora/lora.py +0 -4
- sglang/srt/lora/lora_manager.py +162 -164
- sglang/srt/lora/lora_registry.py +124 -0
- sglang/srt/lora/mem_pool.py +83 -35
- sglang/srt/lora/utils.py +12 -5
- sglang/srt/managers/cache_controller.py +288 -0
- sglang/srt/managers/io_struct.py +60 -30
- sglang/srt/managers/mm_utils.py +7 -8
- sglang/srt/managers/schedule_batch.py +163 -113
- sglang/srt/managers/schedule_policy.py +68 -27
- sglang/srt/managers/scheduler.py +256 -86
- sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- sglang/srt/managers/tokenizer_manager.py +38 -27
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/allocator.py +74 -23
- sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang/srt/mem_cache/hicache_storage.py +168 -0
- sglang/srt/mem_cache/hiradix_cache.py +194 -5
- sglang/srt/mem_cache/memory_pool.py +16 -1
- sglang/srt/mem_cache/memory_pool_host.py +44 -2
- sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +66 -31
- sglang/srt/model_executor/forward_batch_info.py +210 -25
- sglang/srt/model_executor/model_runner.py +147 -42
- sglang/srt/model_loader/loader.py +7 -1
- sglang/srt/model_loader/utils.py +4 -4
- sglang/srt/models/clip.py +1 -1
- sglang/srt/models/deepseek.py +9 -6
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +192 -173
- sglang/srt/models/deepseek_vl2.py +5 -5
- sglang/srt/models/gemma.py +48 -0
- sglang/srt/models/gemma2.py +52 -0
- sglang/srt/models/gemma3_causal.py +63 -0
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -4
- sglang/srt/models/granitemoe.py +385 -0
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/hunyuan.py +63 -16
- sglang/srt/models/internvl.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -1
- sglang/srt/models/llama.py +41 -0
- sglang/srt/models/llama4.py +11 -11
- sglang/srt/models/llava.py +2 -2
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +0 -2
- sglang/srt/models/minicpmo.py +3 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mixtral.py +9 -2
- sglang/srt/models/mllama.py +3 -5
- sglang/srt/models/mllama4.py +13 -6
- sglang/srt/models/olmoe.py +8 -5
- sglang/srt/models/persimmon.py +330 -0
- sglang/srt/models/phi.py +321 -0
- sglang/srt/models/phi4mm.py +44 -4
- sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang/srt/models/phi4mm_utils.py +1917 -0
- sglang/srt/models/phimoe.py +9 -3
- sglang/srt/models/qwen.py +37 -0
- sglang/srt/models/qwen2.py +41 -0
- sglang/srt/models/qwen2_5_vl.py +4 -4
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +53 -9
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/qwen3.py +65 -1
- sglang/srt/models/qwen3_moe.py +57 -24
- sglang/srt/models/vila.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang/srt/multimodal/processors/clip.py +21 -19
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- sglang/srt/multimodal/processors/gemma3.py +13 -17
- sglang/srt/multimodal/processors/gemma3n.py +19 -23
- sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang/srt/multimodal/processors/janus_pro.py +12 -27
- sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- sglang/srt/multimodal/processors/llava.py +4 -2
- sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang/srt/multimodal/processors/mlama.py +21 -18
- sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang/srt/multimodal/processors/phi4mm.py +63 -39
- sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- sglang/srt/multimodal/processors/vila.py +14 -14
- sglang/srt/reasoning_parser.py +46 -4
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/sampling/sampling_params.py +8 -1
- sglang/srt/server_args.py +454 -270
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +10 -5
- sglang/srt/utils.py +44 -69
- sglang/test/runners.py +14 -3
- sglang/test/test_activation.py +50 -1
- sglang/test/test_block_fp8.py +8 -3
- sglang/test/test_block_fp8_ep.py +1 -1
- sglang/test/test_custom_ops.py +12 -7
- sglang/test/test_cutlass_w4a8_moe.py +1 -3
- sglang/test/test_fp4_moe.py +1 -3
- sglang/test/test_marlin_moe.py +286 -0
- sglang/test/test_marlin_utils.py +171 -0
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
- sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -23,8 +23,11 @@ import tempfile
 from typing import List, Literal, Optional, Union

 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
+    LORA_TARGET_ALL_MODULES,
+    SUPPORTED_LORA_TARGET_MODULES,
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
@@ -46,30 +49,28 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
-    skip_server_warmup: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
     trust_remote_code: bool = False
-    dtype: str = "auto"
-    kv_cache_dtype: str = "auto"
-    quantization: Optional[str] = None
-    quantization_param_path: Optional[str] = None
     context_length: Optional[int] = None
-    device: Optional[str] = None
-    served_model_name: Optional[str] = None
-    chat_template: Optional[str] = None
-    completion_template: Optional[str] = None
     is_embedding: bool = False
     enable_multimodal: Optional[bool] = None
     revision: Optional[str] = None
-
-    impl: str = "auto"
+    model_impl: str = "auto"

-    #
+    # HTTP server
     host: str = "127.0.0.1"
     port: int = 30000
+    skip_server_warmup: bool = False
+    warmups: Optional[str] = None
     nccl_port: Optional[int] = None

+    # Quantization and data type
+    dtype: str = "auto"
+    quantization: Optional[str] = None
+    quantization_param_path: Optional[str] = None
+    kv_cache_dtype: str = "auto"
+
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
@@ -79,9 +80,13 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    page_size: int = 1
+    page_size: Optional[int] = None
+    hybrid_kvcache_ratio: Optional[float] = None
+    swa_full_tokens_ratio: float = 0.8
+    disable_hybrid_swa_memory: bool = False

-    #
+    # Runtime options
+    device: Optional[str] = None
     tp_size: int = 1
     pp_size: int = 1
     max_micro_batch_size: Optional[int] = None
@@ -104,9 +109,10 @@ class ServerArgs:
     crash_dump_folder: Optional[str] = None
     show_time_cost: bool = False
     enable_metrics: bool = False
+    enable_metrics_for_all_schedulers: bool = False
     bucket_time_to_first_token: Optional[List[float]] = None
-    bucket_e2e_request_latency: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
+    bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
@@ -114,6 +120,9 @@ class ServerArgs:

     # API related
     api_key: Optional[str] = None
+    served_model_name: Optional[str] = None
+    chat_template: Optional[str] = None
+    completion_template: Optional[str] = None
     file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
@@ -133,7 +142,10 @@ class ServerArgs:
     preferred_sampling_params: Optional[str] = None

     # LoRA
-    lora_paths: Optional[Union[dict[str, str], List[str]]] = None
+    enable_lora: Optional[bool] = None
+    max_lora_rank: Optional[int] = None
+    lora_target_modules: Optional[Union[set[str], List[str]]] = None
+    lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"

@@ -175,6 +187,14 @@ class ServerArgs:
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None

+    # Hierarchical cache
+    enable_hierarchical_cache: bool = False
+    hicache_ratio: float = 2.0
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through_selective"
+    hicache_io_backend: str = ""
+    hicache_storage_backend: Optional[str] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -196,7 +216,6 @@ class ServerArgs:
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
-    disable_overlap_cg_plan: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
@@ -213,18 +232,12 @@ class ServerArgs:
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    enable_hierarchical_cache: bool = False
-    hicache_ratio: float = 2.0
-    hicache_size: int = 0
-    hicache_write_policy: str = "write_through_selective"
-    hicache_io_backend: str = ""
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     enable_triton_kernel_moe: bool = False
-    warmups: Optional[str] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -232,7 +245,7 @@ class ServerArgs:
     debug_tensor_dump_inject: bool = False
     debug_tensor_dump_prefill_only: bool = False

-    #
+    # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: str = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
@@ -247,32 +260,26 @@ class ServerArgs:
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False

+    # For PD-Multiplexing
+    enable_pdmux: bool = False
+    sm_group_num: int = 3
+
     def __post_init__(self):
         # Expert parallelism
+        # We put it here first due to some internal ckpt conversation issues.
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-
-            assert (
-                self.quantization == "modelopt_fp4"
-            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-
-        if self.device is None:
-            self.device = get_device()
-
         if self.served_model_name is None:
             self.served_model_name = self.model_path
-
+        if self.device is None:
+            self.device = get_device()
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

@@ -323,12 +330,12 @@ class ServerArgs:
             self.mem_fraction_static = 0.88

         # Lazy init to avoid circular import
+        # Multimodal models need more memory for the image processor
         from sglang.srt.configs.model_config import ModelConfig

-        # Multimodal models need more memory for the image processor
         model_config = ModelConfig.from_server_args(self)
         if model_config.is_multimodal:
-            self.
+            self.adjust_mem_fraction_for_vlm(model_config)

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
@@ -341,7 +348,6 @@ class ServerArgs:
                 self.chunked_prefill_size = 16384
             else:
                 self.chunked_prefill_size = 4096
-        assert self.chunked_prefill_size % self.page_size == 0

         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
@@ -352,23 +358,6 @@ class ServerArgs:
             else:
                 self.cuda_graph_max_bs = 80

-        assert self.moe_dense_tp_size in {
-            1,
-            None,
-        }, "moe_dense_tp_size only support 1 and None currently"
-
-        if self.attention_backend == "flashmla":
-            logger.warning(
-                "FlashMLA only supports a page_size of 64, change page_size to 64."
-            )
-            self.page_size = 64
-
-        if self.attention_backend == "cutlass_mla":
-            logger.warning(
-                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
-            )
-            self.page_size = 128
-
         # Set kernel backends for hpu device
         if self.device == "hpu":
             self.attention_backend = "torch_native"
@@ -397,6 +386,26 @@ class ServerArgs:
             )
             self.page_size = 128

+        if self.attention_backend == "flashmla":
+            logger.warning(
+                "FlashMLA only supports a page_size of 64, change page_size to 64."
+            )
+            self.page_size = 64
+
+        if self.attention_backend == "cutlass_mla":
+            logger.warning(
+                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
+        # Set page size
+        if self.page_size is None:
+            self.page_size = 1
+
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
         # Choose grammar backend
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
@@ -418,6 +427,13 @@ class ServerArgs:
                 self.enable_dp_attention
             ), "Please enable dp attention when setting enable_dp_lm_head. "

+        # MoE kernel
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
@@ -428,12 +444,6 @@ class ServerArgs:
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )

-        if self.pp_size > 1:
-            self.disable_overlap_schedule = True
-            logger.warning(
-                "Pipeline parallelism is incompatible with overlap schedule."
-            )
-
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.info(
@@ -459,6 +469,13 @@ class ServerArgs:
         elif self.expert_distribution_recorder_mode is not None:
             self.expert_distribution_recorder_buffer_size = 1000

+        # Pipeline parallelism
+        if self.pp_size > 1:
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Pipeline parallelism is incompatible with overlap schedule."
+            )
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
@@ -479,10 +496,9 @@ class ServerArgs:
                 "eagle speculative decoding."
             )

-            model_arch = get_model_arch(self)
-
-            # Auto set draft_model_path DeepSeek-V3/R1
+            model_arch = self.get_hf_config().architectures[0]
             if model_arch == "DeepseekV3ForCausalLM":
+                # Auto set draft_model_path DeepSeek-V3/R1
                 if self.speculative_draft_model_path is None:
                     self.speculative_draft_model_path = self.model_path
                 else:
@@ -521,12 +537,11 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"

+        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
-
-
-        if is_hip():
-            self.triton_attention_num_kv_splits = 16
+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []

         # PD disaggregation
         if self.disaggregation_mode == "decode":
@@ -551,6 +566,7 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")

+        # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
@@ -559,20 +575,9 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )

-        if self.custom_weight_loader is None:
-            self.custom_weight_loader = []
-
-    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
-        larger_tp = max(decode_tp, prefill_tp)
-        smaller_tp = min(decode_tp, prefill_tp)
-        assert larger_tp % smaller_tp == 0, (
-            "Different tp size is supported only when one tp is multiple of the other. "
-            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
-        )
-
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
-        # Model and
+        # Model and tokenizer
         parser.add_argument(
             "--model-path",
             "--model",
@@ -586,24 +591,6 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
-        parser.add_argument(
-            "--host",
-            type=str,
-            default=ServerArgs.host,
-            help="The host of the HTTP server.",
-        )
-        parser.add_argument(
-            "--port",
-            type=int,
-            default=ServerArgs.port,
-            help="The port of the HTTP server.",
-        )
-        parser.add_argument(
-            "--nccl-port",
-            type=int,
-            default=ServerArgs.nccl_port,
-            help="The port for NCCL distributed environment setup. Defaults to a random port.",
-        )
         parser.add_argument(
             "--tokenizer-mode",
             type=str,
@@ -618,11 +605,6 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request.",
         )
-        parser.add_argument(
-            "--skip-server-warmup",
-            action="store_true",
-            help="If set, skip warmup.",
-        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -668,6 +650,77 @@ class ServerArgs:
             action="store_true",
             help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
         )
+        parser.add_argument(
+            "--context-length",
+            type=int,
+            default=ServerArgs.context_length,
+            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
+        )
+        parser.add_argument(
+            "--is-embedding",
+            action="store_true",
+            help="Whether to use a CausalLM as an embedding model.",
+        )
+        parser.add_argument(
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
+            action="store_true",
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
+        )
+        parser.add_argument(
+            "--revision",
+            type=str,
+            default=None,
+            help="The specific model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
+        parser.add_argument(
+            "--model-impl",
+            type=str,
+            default=ServerArgs.model_impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )
+
+        # HTTP server
+        parser.add_argument(
+            "--host",
+            type=str,
+            default=ServerArgs.host,
+            help="The host of the HTTP server.",
+        )
+        parser.add_argument(
+            "--port",
+            type=int,
+            default=ServerArgs.port,
+            help="The port of the HTTP server.",
+        )
+        parser.add_argument(
+            "--skip-server-warmup",
+            action="store_true",
+            help="If set, skip warmup.",
+        )
+        parser.add_argument(
+            "--warmups",
+            type=str,
+            required=False,
+            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
+            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
+        )
+        parser.add_argument(
+            "--nccl-port",
+            type=int,
+            default=ServerArgs.nccl_port,
+            help="The port for NCCL distributed environment setup. Defaults to a random port.",
+        )
+
+        # Quantization and data type
         parser.add_argument(
             "--dtype",
             type=str,
@@ -682,13 +735,6 @@ class ServerArgs:
             '* "float" is shorthand for FP32 precision.\n'
             '* "float32" for FP32 precision.',
         )
-        parser.add_argument(
-            "--kv-cache-dtype",
-            type=str,
-            default=ServerArgs.kv_cache_dtype,
-            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
-            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
-        )
         parser.add_argument(
             "--quantization",
             type=str,
@@ -704,6 +750,7 @@ class ServerArgs:
                 "gguf",
                 "modelopt",
                 "modelopt_fp4",
+                "petit_nvfp4",
                 "w8a8_int8",
                 "w8a8_fp8",
                 "moe_wna16",
@@ -722,65 +769,11 @@ class ServerArgs:
             "default to 1.0, which may cause accuracy issues. ",
         )
         parser.add_argument(
-            "--context-length",
-            type=int,
-            default=ServerArgs.context_length,
-            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
-        )
-        parser.add_argument(
-            "--device",
-            type=str,
-            default=ServerArgs.device,
-            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
-        )
-        parser.add_argument(
-            "--served-model-name",
-            type=str,
-            default=ServerArgs.served_model_name,
-            help="Override the model name returned by the v1/models endpoint in OpenAI API server.",
-        )
-        parser.add_argument(
-            "--chat-template",
-            type=str,
-            default=ServerArgs.chat_template,
-            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
-        )
-        parser.add_argument(
-            "--completion-template",
-            type=str,
-            default=ServerArgs.completion_template,
-            help="The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently.",
-        )
-        parser.add_argument(
-            "--is-embedding",
-            action="store_true",
-            help="Whether to use a CausalLM as an embedding model.",
-        )
-        parser.add_argument(
-            "--enable-multimodal",
-            default=ServerArgs.enable_multimodal,
-            action="store_true",
-            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
-        )
-        parser.add_argument(
-            "--revision",
-            type=str,
-            default=None,
-            help="The specific model version to use. It can be a branch "
-            "name, a tag name, or a commit id. If unspecified, will use "
-            "the default version.",
-        )
-        parser.add_argument(
-            "--impl",
+            "--kv-cache-dtype",
             type=str,
-            default=ServerArgs.impl,
-            help="Which implementation of the model to use.\n\n"
-            '* "auto" will try to use the SGLang implementation if it exists '
-            "and fall back to the Transformers implementation if no SGLang "
-            "implementation is available.\n"
-            '* "sglang" will use the SGLang model implementation.\n'
-            '* "transformers" will use the Transformers model '
-            "implementation.\n",
+            default=ServerArgs.kv_cache_dtype,
+            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
+            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )

         # Memory and scheduling
@@ -852,8 +845,26 @@ class ServerArgs:
                 "(1.0 = pure hybrid: swa_size / full_size = local_attention_size / context_length)"
             ),
         )
+        parser.add_argument(
+            "--swa-full-tokens-ratio",
+            type=float,
+            default=ServerArgs.swa_full_tokens_ratio,
+            help="The ratio of SWA layer KV tokens / full layer KV tokens, regardless of the number of swa:full layers. It should be between 0 and 1. "
+            "E.g. 0.5 means if each swa layer has 50 tokens, then each full layer has 100 tokens.",
+        )
+        parser.add_argument(
+            "--disable-hybrid-swa-memory",
+            action="store_true",
+            help="Disable the hybrid SWA memory.",
+        )

-        #
+        # Runtime options
+        parser.add_argument(
+            "--device",
+            type=str,
+            default=ServerArgs.device,
+            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
+        )
         parser.add_argument(
             "--tensor-parallel-size",
             "--tp-size",
@@ -895,7 +906,7 @@ class ServerArgs:
             "--constrained-json-whitespace-pattern",
             type=str,
             default=ServerArgs.constrained_json_whitespace_pattern,
-            help=
+            help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
         )
         parser.add_argument(
             "--watchdog-timeout",
@@ -974,6 +985,13 @@ class ServerArgs:
             action="store_true",
             help="Enable log prometheus metrics.",
         )
+        parser.add_argument(
+            "--enable-metrics-for-all-schedulers",
+            action="store_true",
+            help="Enable --enable-metrics-for-all-schedulers when you want schedulers on all TP ranks (not just TP 0) "
+            "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
+            "otherwise all metrics appear to come from TP 0.",
+        )
         parser.add_argument(
             "--bucket-time-to-first-token",
             type=float,
@@ -1001,12 +1019,6 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
-        parser.add_argument(
-            "--kv-events-config",
-            type=str,
-            default=None,
-            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
-        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
@@ -1019,6 +1031,12 @@ class ServerArgs:
             default=ServerArgs.enable_request_time_stats_logging,
             help="Enable per request time stats logging",
         )
+        parser.add_argument(
+            "--kv-events-config",
+            type=str,
+            default=None,
+            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
+        )

         # API related
         parser.add_argument(
@@ -1027,6 +1045,24 @@ class ServerArgs:
             default=ServerArgs.api_key,
             help="Set API key of the server. It is also used in the OpenAI API compatible server.",
         )
+        parser.add_argument(
+            "--served-model-name",
+            type=str,
+            default=ServerArgs.served_model_name,
+            help="Override the model name returned by the v1/models endpoint in OpenAI API server.",
+        )
+        parser.add_argument(
+            "--chat-template",
+            type=str,
+            default=ServerArgs.chat_template,
+            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+        )
+        parser.add_argument(
+            "--completion-template",
+            type=str,
+            default=ServerArgs.completion_template,
+            help="The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently.",
+        )
         parser.add_argument(
             "--file-storage-path",
             type=str,
@@ -1055,9 +1091,10 @@ class ServerArgs:
                 "deepseekv3",
                 "pythonic",
                 "kimi_k2",
+                "qwen3_coder",
             ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'.",
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
         )

         # Data parallelism
@@ -1107,6 +1144,28 @@ class ServerArgs:
         )

         # LoRA
+        parser.add_argument(
+            "--enable-lora",
+            default=ServerArgs.enable_lora,
+            action="store_true",
+            help="Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility.",
+        )
+        parser.add_argument(
+            "--max-lora-rank",
+            default=ServerArgs.max_lora_rank,
+            type=int,
+            help="The maximum rank of LoRA adapters. If not specified, it will be automatically inferred from the adapters provided in --lora-paths.",
+        )
+        parser.add_argument(
+            "--lora-target-modules",
+            type=str,
+            choices=SUPPORTED_LORA_TARGET_MODULES + [LORA_TARGET_ALL_MODULES],
+            nargs="*",
+            default=None,
+            help="The union set of all target modules where LoRA should be applied. If not specified, "
+            "it will be automatically inferred from the adapters provided in --lora-paths. If 'all' is specified, "
+            "all supported modules will be targeted.",
+        )
         parser.add_argument(
             "--lora-paths",
             type=str,
@@ -1160,6 +1219,13 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )

         # Speculative decoding
         parser.add_argument(
@@ -1209,13 +1275,6 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )

         # Expert parallelism
         parser.add_argument(
@@ -1323,6 +1382,46 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )

+        # Hierarchical cache
+        parser.add_argument(
+            "--enable-hierarchical-cache",
+            action="store_true",
+            help="Enable hierarchical cache",
+        )
+        parser.add_argument(
+            "--hicache-ratio",
+            type=float,
+            default=ServerArgs.hicache_ratio,
+            help="The ratio of the size of host KV cache memory pool to the size of device pool.",
+        )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
+        parser.add_argument(
+            "--hicache-io-backend",
+            type=str,
+            choices=["direct", "kernel"],
+            default=ServerArgs.hicache_io_backend,
+            help="The IO backend for KV cache transfer between CPU and GPU",
+        )
+        parser.add_argument(
+            "--hicache-storage-backend",
+            type=str,
+            choices=["file"],  # todo, mooncake
+            default=ServerArgs.hicache_storage_backend,
+            help="The storage backend for hierarchical KV cache.",
+        )
+
         # Double Sparsity
         parser.add_argument(
             "--enable-double-sparsity",
@@ -1423,11 +1522,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
-        parser.add_argument(
-            "--disable-overlap-cg-plan",
-            action="store_true",
-            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
-        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1515,37 +1609,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--enable-hierarchical-cache",
-            action="store_true",
-            help="Enable hierarchical cache",
-        )
-        parser.add_argument(
-            "--hicache-ratio",
-            type=float,
-            default=ServerArgs.hicache_ratio,
-            help="The ratio of the size of host KV cache memory pool to the size of device pool.",
-        )
-        parser.add_argument(
-            "--hicache-size",
-            type=int,
-            default=ServerArgs.hicache_size,
-            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
-        )
-        parser.add_argument(
-            "--hicache-write-policy",
-            type=str,
-            choices=["write_back", "write_through", "write_through_selective"],
-            default=ServerArgs.hicache_write_policy,
-            help="The write policy of hierarchical cache.",
-        )
-        parser.add_argument(
-            "--hicache-io-backend",
-            type=str,
-            choices=["direct", "kernel"],
-            default=ServerArgs.hicache_io_backend,
-            help="The IO backend for KV cache transfer between CPU and GPU",
-        )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
             action="store_true",
@@ -1576,13 +1639,6 @@ class ServerArgs:
             action="store_true",
             help="Use triton moe grouped gemm kernel.",
         )
-        parser.add_argument(
-            "--warmups",
-            type=str,
-            required=False,
-            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
-            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
-        )

         # Debug tensor dumps
         parser.add_argument(
@@ -1609,7 +1665,7 @@ class ServerArgs:
             help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
         )

-        #
+        # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
             type=str,
@@ -1668,6 +1724,8 @@ class ServerArgs:
             default=None,
             help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
         )
+
+        # Custom weight loader
         parser.add_argument(
             "--custom-weight-loader",
             type=str,
@@ -1675,6 +1733,19 @@ class ServerArgs:
             default=None,
             help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
         )
+        parser.add_argument(
+            "--enable-pdmux",
+            action="store_true",
+            help="Enable PD-Multiplexing, PD running on greenctx stream.",
+        )
+
+        # For PD-Multiplexing
+        parser.add_argument(
+            "--sm-group-num",
+            type=int,
+            default=ServerArgs.sm_group_num,
+            help="Number of sm partition groups.",
+        )
         parser.add_argument(
             "--weight-loader-disable-mmap",
             action="store_true",
@@ -1696,12 +1767,23 @@ class ServerArgs:
         else:
             return f"http://{self.host}:{self.port}"

+    def get_hf_config(self):
+        kwargs = {}
+        hf_config = get_config(
+            self.model_path,
+            trust_remote_code=self.trust_remote_code,
+            revision=self.revision,
+            model_override_args=json.loads(self.json_model_override_args),
+            **kwargs,
+        )
+        return hf_config
+
     def check_server_args(self):
+        # Check parallel size constraints
         assert (
             self.tp_size * self.pp_size
         ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"

-        # FIXME pp constraints
         if self.pp_size > 1:
             assert (
                 self.disable_overlap_schedule
@@ -1712,23 +1794,143 @@ class ServerArgs:
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
+
+        assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
+        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
+
+        assert self.moe_dense_tp_size in {
+            1,
+            None,
+        }, "moe_dense_tp_size only support 1 and None currently"
+
+        # Check model architecture
+        model_arch = self.get_hf_config().architectures[0]
+        if "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+
+        # Check LoRA
+        self.check_lora_server_args()
+
+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is required for speculative decoding"
+
+        # Check chunked prefill
+        assert (
+            self.chunked_prefill_size % self.page_size == 0
+        ), "chunked_prefill_size must be divisible by page_size"
+
+    def check_lora_server_args(self):
         assert (
             self.max_loras_per_batch > 0
             # FIXME
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and radix attention is in progress"
-        assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
-        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"

-        if
-
-        self.
-
-
-
-
-
-
+        # Enable LoRA if any LoRA paths are provided for backward compatibility.
+        if self.lora_paths:
+            if self.enable_lora is None:
+                self.enable_lora = True
+                logger.info(
+                    "--enable-lora is set to True because --lora-paths is provided."
+                )
+            elif self.enable_lora is False:
+                logger.warning(
+                    "--enable-lora is set to False, any provided lora_paths will be ignored."
+                )
+
+        if self.enable_lora:
+            # Normalize lora_paths to a dictionary if it is a list.
+            if isinstance(self.lora_paths, list):
+                lora_paths = self.lora_paths
+                self.lora_paths = {}
+                for lora_path in lora_paths:
+                    if "=" in lora_path:
+                        name, path = lora_path.split("=", 1)
+                        self.lora_paths[name] = LoRARef(lora_name=name, lora_path=path)
+                    else:
+                        self.lora_paths[lora_path] = LoRARef(
+                            lora_name=lora_path,
+                            lora_path=lora_path,
+                        )
+            elif isinstance(self.lora_paths, dict):
+                self.lora_paths = {
+                    k: LoRARef(lora_name=k, lora_path=v)
+                    for k, v in self.lora_paths.items()
+                }
+            elif self.lora_paths is None:
+                self.lora_paths = {}
+            else:
+                raise ValueError(
+                    f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
+                    "Expected a list or a dictionary."
+                )
+
+            # Expand target modules
+            if self.lora_target_modules:
+                self.lora_target_modules = set(self.lora_target_modules)
+                if "all" in self.lora_target_modules:
+                    assert (
+                        len(self.lora_target_modules) == 1
+                    ), "If 'all' is specified in --lora-target-modules, it should be the only module specified."
+                    self.lora_target_modules = set(SUPPORTED_LORA_TARGET_MODULES)
+
+            # Ensure sufficient information is provided for LoRA initialization.
+            assert self.lora_paths or (
+                self.max_lora_rank and self.lora_target_modules
+            ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
+
+    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+        larger_tp = max(decode_tp, prefill_tp)
+        smaller_tp = min(decode_tp, prefill_tp)
+        assert larger_tp % smaller_tp == 0, (
+            "Different tp size is supported only when one tp is multiple of the other. "
+            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+        )
+
+    def adjust_mem_fraction_for_vlm(self, model_config):
+        vision_config = getattr(model_config.hf_config, "vision_config", None)
+        if vision_config is None:
+            return
+
+        # roughly reduce the mem_fraction_static base on params of Vit
+        original_server_arg_mem_fraction = self.mem_fraction_static
+        # a base mem_fraction_static factor for regular Vit
+        base_mem_fraction_reduction_ratio = 0.95
+
+        vit_num_layers = getattr(vision_config, "num_hidden_layers", 24)
+        vit_hidden_size = getattr(vision_config, "hidden_size", 1024)
+
+        # baseline ViT params (ViT-L/14)
+        baseline_vit_layers = 24
+        baseline_vit_hidden_size = 1024
+
+        # weight params count
+        current_complexity_score = vit_num_layers * (vit_hidden_size**2)
+        baseline_complexity_score = baseline_vit_layers * (baseline_vit_hidden_size**2)
+        complexity_ratio = (
+            current_complexity_score / baseline_complexity_score
+            if baseline_complexity_score > 0
+            else 1.0
+        )
+
+        # every time the complexity grows 100%, adjust final factor for 10%
+        sensitivity_scale = 0.1
+        dynamic_adjustment_factor = 1.0 - sensitivity_scale * (complexity_ratio - 1.0)
+        dynamic_adjustment_factor = max(0.8, min(1.05, dynamic_adjustment_factor))
+
+        final_overall_factor = (
+            base_mem_fraction_reduction_ratio * dynamic_adjustment_factor
+        )
+        self.mem_fraction_static = (
+            original_server_arg_mem_fraction * final_overall_factor
+        )
+        logger.warning(
+            f"Multimodal model: Dynamically adjusted --mem-fraction-static "
+            f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
+        )


 def prepare_server_args(argv: List[str]) -> ServerArgs:
@@ -1773,16 +1975,16 @@ class PortArgs:
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
-            port = server_args.port + random.randint(100, 1000)
+            nccl_port = server_args.port + random.randint(100, 1000)
             while True:
-                if is_port_available(port):
+                if is_port_available(nccl_port):
                     break
-                if port < 60000:
-                    port += 42
+                if nccl_port < 60000:
+                    nccl_port += 42
                 else:
-                    port -= 43
+                    nccl_port -= 43
         else:
-            port = server_args.nccl_port
+            nccl_port = server_args.nccl_port

         if not server_args.enable_dp_attention:
             # Normal case, use IPC within a single node
@@ -1790,7 +1992,7 @@ class PortArgs:
             tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
             scheduler_input_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
             detokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
-            nccl_port=port,
+            nccl_port=nccl_port,
             rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
             metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
         )
@@ -1820,7 +2022,7 @@ class PortArgs:
             tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
             scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
             detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
-            nccl_port=port,
+            nccl_port=nccl_port,
             rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
             metrics_ipc_name=f"tcp://{dist_init_host}:{port_base + 3}",
         )
@@ -1847,31 +2049,13 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)


-def get_model_arch(args: ServerArgs):
-    hf_config = get_config(
-        args.model_path,
-        trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
-        model_override_args=json.loads(args.json_model_override_args),
-    )
-    return hf_config.architectures[0]
-
-
 def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.

     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
-
-
-    hf_config = get_config(
-        self.model_path,
-        trust_remote_code=self.trust_remote_code,
-        revision=self.revision,
-        model_override_args=json.loads(self.json_model_override_args),
-        **kwargs,
-    )
+    hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]

     if arch in ["LlamaForCausalLM"]: