sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -6
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +24 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +27 -2
- sglang/srt/entrypoints/http_server.py +12 -0
- sglang/srt/entrypoints/openai/protocol.py +2 -2
- sglang/srt/entrypoints/openai/serving_chat.py +22 -6
- sglang/srt/entrypoints/openai/serving_completions.py +9 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +11 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
- sglang/srt/layers/attention/triton_backend.py +85 -46
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
- sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +51 -3
- sglang/srt/layers/dp_attention.py +23 -4
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +5 -1
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/quantization/__init__.py +13 -14
- sglang/srt/layers/quantization/awq.py +7 -7
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +5 -4
- sglang/srt/layers/quantization/marlin_utils.py +11 -3
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +165 -68
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +206 -37
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +25 -0
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +76 -18
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +9 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +4 -9
- sglang/srt/managers/scheduler.py +25 -16
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +60 -21
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +7 -5
- sglang/srt/mem_cache/allocator_ascend.py +0 -11
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +25 -12
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +43 -32
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +3 -1
- sglang/srt/models/deepseek_v2.py +224 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +25 -63
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +34 -74
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama4.py +0 -2
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +3 -18
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +7 -1
- sglang/srt/models/qwen3_moe.py +9 -38
- sglang/srt/models/step3_vl.py +2 -1
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +6 -1
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/server_args.py +237 -104
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +16 -11
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
- sglang/srt/layers/quantization/fp4.py +0 -557
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -23,6 +23,7 @@ import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
@@ -33,10 +34,12 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
     is_remote_url,
+    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -82,7 +85,6 @@ class ServerArgs:
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
-    cpu_offload_gb: int = 0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
@@ -120,6 +122,7 @@ class ServerArgs:
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
+    gc_warning_threshold_secs: float = 0.0
 
     # API related
     api_key: Optional[str] = None
@@ -150,7 +153,9 @@ class ServerArgs:
     enable_lora: Optional[bool] = None
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
-    lora_paths: Optional[
+    lora_paths: Optional[
+        Union[dict[str, str], List[dict[str, str]], List[str], List[LoRARef]]
+    ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
@@ -175,9 +180,16 @@ class ServerArgs:
 
     # Expert parallelism
     ep_size: int = 1
-    moe_a2a_backend:
-
-
+    moe_a2a_backend: Literal["none", "deepep"] = "none"
+    moe_runner_backend: Literal[
+        "auto",
+        "triton",
+        "triton_kernel",
+        "flashinfer_trtllm",
+        "flashinfer_cutlass",
+        "flashinfer_mxfp4",
+    ] = "auto"
+    flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
@@ -213,6 +225,13 @@ class ServerArgs:
     ds_heavy_channel_type: str = "qk"
     ds_sparse_decode_threshold: int = 4096
 
+    # Offloading
+    cpu_offload_gb: int = 0
+    offload_group_size: int = -1
+    offload_num_in_group: int = 1
+    offload_prefetch_step: int = 1
+    offload_mode: str = "cpu"
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
@@ -223,6 +242,7 @@ class ServerArgs:
     enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
     enable_symm_mem: bool = False
+    disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -250,8 +270,6 @@ class ServerArgs:
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
    enable_return_hidden_states: bool = False
-    enable_triton_kernel_moe: bool = False
-    enable_flashinfer_mxfp4_moe: bool = False
     scheduler_recv_interval: int = 1
 
     # Debug tensor dumps
@@ -282,12 +300,13 @@ class ServerArgs:
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
+    enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
 
     def __post_init__(self):
         # Check deprecated arguments
-        def print_deprecated_warning(message: str):
-            logger.warning(f"\033[33m{message}\033[0m")
-
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             print_deprecated_warning(
@@ -298,6 +317,26 @@ class ServerArgs:
             print_deprecated_warning(
                 "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
             )
+        if self.enable_triton_kernel_moe:
+            self.moe_runner_backend = "triton_kernel"
+            print_deprecated_warning(
+                "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
+            )
+        if self.enable_flashinfer_cutlass_moe:
+            self.moe_runner_backend = "flashinfer_cutlass"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
+            )
+        if self.enable_flashinfer_trtllm_moe:
+            self.moe_runner_backend = "flashinfer_trtllm"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
+            )
+        if self.enable_flashinfer_mxfp4_moe:
+            self.moe_runner_backend = "flashinfer_mxfp4"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
+            )
 
         # Set missing default values
         if self.tokenizer_path is None:
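The four deprecated MoE booleans are folded into single values of the new `--moe-runner-backend` enum, so old launch scripts keep working while emitting a deprecation notice. A minimal standalone sketch of the remapping pattern (only the fields involved are reproduced; the real `ServerArgs.__post_init__` does far more):

```python
from dataclasses import dataclass


@dataclass
class MoeArgs:
    """Sketch of the deprecated-flag remap above; not the full ServerArgs."""

    moe_runner_backend: str = "auto"
    enable_triton_kernel_moe: bool = False       # deprecated alias
    enable_flashinfer_cutlass_moe: bool = False  # deprecated alias

    def __post_init__(self):
        # Each deprecated boolean rewrites the new enum-valued flag.
        if self.enable_triton_kernel_moe:
            self.moe_runner_backend = "triton_kernel"
        if self.enable_flashinfer_cutlass_moe:
            self.moe_runner_backend = "flashinfer_cutlass"


# An old-style invocation is silently folded into the new flag:
assert MoeArgs(enable_triton_kernel_moe=True).moe_runner_backend == "triton_kernel"
```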
@@ -448,11 +487,6 @@ class ServerArgs:
             )
             self.page_size = 64
 
-        if self.speculative_algorithm is not None:
-            raise ValueError(
-                "trtllm_mla backend does not support speculative decoding yet."
-            )
-
         if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
             raise ValueError(
                 "TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
@@ -474,11 +508,6 @@ class ServerArgs:
             )
             self.page_size = 64
 
-        if self.speculative_algorithm is not None:
-            raise ValueError(
-                "trtllm_mha backend does not support speculative decoding yet."
-            )
-
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
                 "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
@@ -517,17 +546,16 @@ class ServerArgs:
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # MoE kernel
-        if self.
+        if self.moe_runner_backend == "flashinfer_cutlass":
             assert (
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
             assert self.ep_size in [
                 1,
                 self.tp_size,
             ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
-        if self.
+        if self.moe_runner_backend == "flashinfer_trtllm":
             if not self.disable_shared_experts_fusion:
                 self.disable_shared_experts_fusion = True
                 logger.warning(
@@ -556,7 +584,7 @@ class ServerArgs:
             self.ep_dispatch_algorithm = "static"
 
         if self.enable_eplb:
-            assert self.ep_size > 1
+            assert self.ep_size > 1
 
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -611,6 +639,10 @@ class ServerArgs:
             logger.warning(
                 "DeepSeek MTP does not require setting speculative_draft_model_path."
             )
+            if self.page_size != 1 and self.attention_backend == "flashinfer":
+                raise ValueError(
+                    "Speculative decoding with page_size != 1 is not supported. Please set page_size to 1."
+                )
 
         # Auto choose parameters
         if self.speculative_num_steps is None:
@@ -624,6 +656,16 @@ class ServerArgs:
                 self.speculative_num_draft_tokens,
             ) = auto_choose_speculative_params(self)
 
+        if (
+            self.attention_backend == "trtllm_mha"
+            or self.decode_attention_backend == "trtllm_mha"
+            or self.prefill_attention_backend == "trtllm_mha"
+        ):
+            if self.speculative_eagle_topk > 1:
+                raise ValueError(
+                    "trtllm_mha backend only supports topk = 1 for speculative decoding."
+                )
+
         if (
             self.speculative_eagle_topk == 1
             and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
@@ -681,6 +723,12 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )
 
+        if self.enable_hierarchical_cache and self.disable_radix_cache:
+            raise ValueError(
+                "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
+                "and cannot be used at the same time. Please use only one of them."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -934,12 +982,6 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
-        parser.add_argument(
-            "--cpu-offload-gb",
-            type=int,
-            default=ServerArgs.cpu_offload_gb,
-            help="How many GBs of RAM to reserve for CPU offloading.",
-        )
         parser.add_argument(
             "--page-size",
             type=int,
@@ -1132,6 +1174,12 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        parser.add_argument(
+            "--gc-warning-threshold-secs",
+            type=float,
+            default=ServerArgs.gc_warning_threshold_secs,
+            help="The threshold for long GC warning. If a GC takes longer than this, a warning will be logged. Set to 0 to disable.",
+        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
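`--gc-warning-threshold-secs` surfaces long garbage-collection pauses, which can stall token generation on a busy server. The sketch below shows the underlying mechanism using only the stdlib `gc.callbacks` hook; it illustrates how such a watchdog can work, not sglang's exact implementation:

```python
import gc
import logging
import time

logger = logging.getLogger(__name__)


def install_gc_watchdog(threshold_secs: float) -> None:
    """Warn whenever a GC pass takes longer than threshold_secs (<= 0 disables)."""
    if threshold_secs <= 0:
        return
    start_times = {}  # generation -> monotonic start timestamp

    def on_gc(phase, info):
        gen = info["generation"]
        if phase == "start":
            start_times[gen] = time.monotonic()
        elif phase == "stop" and gen in start_times:
            elapsed = time.monotonic() - start_times.pop(gen)
            if elapsed > threshold_secs:
                logger.warning(
                    "Long GC pause: generation %d took %.3fs (collected %d objects)",
                    gen, elapsed, info["collected"],
                )

    gc.callbacks.append(on_gc)
```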
@@ -1200,23 +1248,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        tool_call_parser_choices = list(FunctionCallParser.ToolCallParserEnum.keys())
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[
-                "qwen25",
-                "mistral",
-                "llama3",
-                "deepseekv3",
-                "pythonic",
-                "kimi_k2",
-                "qwen3_coder",
-                "glm45",
-                "step3",
-                "gpt-oss",
-            ],
+            choices=tool_call_parser_choices,
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include:
+            help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
         parser.add_argument(
             "--tool-server",
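Deriving the `--tool-call-parser` choices from `FunctionCallParser.ToolCallParserEnum` keeps the CLI in sync with the parser registry: a newly registered detector shows up in `choices` and in the help text without touching this file. The same pattern in self-contained form (the registry dict here is a stand-in, not sglang's real one):

```python
import argparse

# Stand-in registry mapping parser name -> detector class, playing the role
# of FunctionCallParser.ToolCallParserEnum.
TOOL_CALL_PARSERS = {"llama3": object, "mistral": object, "pythonic": object}

tool_call_parser_choices = list(TOOL_CALL_PARSERS.keys())
parser = argparse.ArgumentParser()
parser.add_argument(
    "--tool-call-parser",
    type=str,
    choices=tool_call_parser_choices,  # derived, so new parsers appear automatically
    default=None,
    help=f"Specify the parser for handling tool-call interactions. "
    f"Options include: {tool_call_parser_choices}.",
)

args = parser.parse_args(["--tool-call-parser", "llama3"])
assert args.tool_call_parser == "llama3"
```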
@@ -1301,7 +1339,7 @@ class ServerArgs:
             nargs="*",
             default=None,
             action=LoRAPathAction,
-            help=
+            help='The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {"lora_name":str,"lora_path":str,"pinned":bool}',
         )
         parser.add_argument(
             "--max-loras-per-batch",
@@ -1446,19 +1484,30 @@ class ServerArgs:
         parser.add_argument(
             "--moe-a2a-backend",
             type=str,
-            choices=["deepep"],
+            choices=["none", "deepep"],
             default=ServerArgs.moe_a2a_backend,
             help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
-            "--
-
-
+            "--moe-runner-backend",
+            type=str,
+            choices=[
+                "auto",
+                "triton",
+                "triton_kernel",
+                "flashinfer_trtllm",
+                "flashinfer_cutlass",
+                "flashinfer_mxfp4",
+            ],
+            default=ServerArgs.moe_runner_backend,
+            help="Choose the runner backend for MoE.",
         )
         parser.add_argument(
-            "--
-
-
+            "--flashinfer-mxfp4-moe-precision",
+            type=str,
+            choices=["mxfp4", "bf16"],
+            default=ServerArgs.flashinfer_mxfp4_moe_precision,
+            help="Choose the computation precision of flashinfer mxfp4 moe",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
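Together these define the new MoE launch surface, e.g. `--moe-a2a-backend deepep --moe-runner-backend flashinfer_trtllm`. One way to sanity-check the flags without starting a server is to drive sglang's own parser definition; a sketch assuming sglang 0.5.1 is installed, with an illustrative model path:

```python
import argparse

from sglang.srt.server_args import ServerArgs

# Build a parser from the add_cli_args definition shown above and parse
# the new MoE flags; no server is started.
parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    [
        "--model-path", "openai/gpt-oss-120b",  # illustrative
        "--moe-a2a-backend", "deepep",
        "--moe-runner-backend", "flashinfer_trtllm",
    ]
)
print(args.moe_runner_backend)  # -> "flashinfer_trtllm"
```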
@@ -1634,6 +1683,38 @@ class ServerArgs:
             help="The type of heavy channels in double sparsity attention",
         )
 
+        # Offloading
+        parser.add_argument(
+            "--cpu-offload-gb",
+            type=int,
+            default=ServerArgs.cpu_offload_gb,
+            help="How many GBs of RAM to reserve for CPU offloading.",
+        )
+        parser.add_argument(
+            "--offload-group-size",
+            type=int,
+            default=ServerArgs.offload_group_size,
+            help="Number of layers per group in offloading.",
+        )
+        parser.add_argument(
+            "--offload-num-in-group",
+            type=int,
+            default=ServerArgs.offload_num_in_group,
+            help="Number of layers to be offloaded within a group.",
+        )
+        parser.add_argument(
+            "--offload-prefetch-step",
+            type=int,
+            default=ServerArgs.offload_prefetch_step,
+            help="Steps to prefetch in offloading.",
+        )
+        parser.add_argument(
+            "--offload-mode",
+            type=str,
+            default=ServerArgs.offload_mode,
+            help="Mode of offloading.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-radix-cache",
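These knobs feed the new grouped offloader (`sglang/srt/offloader.py`, added in this release with 433 lines). The help strings suggest a layer-grouping scheme; the sketch below is a hypothetical illustration of those semantics only, not the actual selection logic, which lives in the offloader module:

```python
def offloaded_layers(num_layers: int, group_size: int, num_in_group: int) -> list[int]:
    """Hypothetical reading of --offload-group-size / --offload-num-in-group:
    within every consecutive group of `group_size` layers, `num_in_group`
    layers are kept offloaded (here: the first ones of each group)."""
    if group_size <= 0:  # -1 is the default; treat it as "no grouping" in this sketch
        return []
    offloaded = []
    for start in range(0, num_layers, group_size):
        group = list(range(start, min(start + group_size, num_layers)))
        offloaded.extend(group[:num_in_group])
    return offloaded


# 12 layers, groups of 4, one offloaded layer per group -> [0, 4, 8]
print(offloaded_layers(12, group_size=4, num_in_group=1))
```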
@@ -1682,6 +1763,11 @@ class ServerArgs:
             action="store_true",
             help="Enable NCCL symmetric memory for fast collectives.",
         )
+        parser.add_argument(
+            "--disable-flashinfer-cutlass-moe-fp4-allgather",
+            action="store_true",
+            help="Disables quantize before all-gather for flashinfer cutlass moe.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
@@ -1825,16 +1911,6 @@ class ServerArgs:
             action="store_true",
             help="Enable returning hidden states with responses.",
         )
-        parser.add_argument(
-            "--enable-triton-kernel-moe",
-            action="store_true",
-            help="Use triton moe grouped gemm kernel.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-mxfp4-moe",
-            action="store_true",
-            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
-        )
         parser.add_argument(
             "--scheduler-recv-interval",
             type=int,
@@ -1935,24 +2011,25 @@ class ServerArgs:
             default=None,
             help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
         )
+        parser.add_argument(
+            "--weight-loader-disable-mmap",
+            action="store_true",
+            help="Disable mmap while loading weight using safetensors.",
+        )
+
+        # For PD-Multiplexing
         parser.add_argument(
             "--enable-pdmux",
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
 
-        # For PD-Multiplexing
         parser.add_argument(
             "--sm-group-num",
             type=int,
             default=ServerArgs.sm_group_num,
             help="Number of sm partition groups.",
         )
-        parser.add_argument(
-            "--weight-loader-disable-mmap",
-            action="store_true",
-            help="Disable mmap while loading weight using safetensors.",
-        )
 
         # Deprecated arguments
         parser.add_argument(
@@ -1965,6 +2042,26 @@ class ServerArgs:
             action="store_true",
             help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-cutlass-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-triton-kernel-moe",
+            action="store_true",
+            help="(Deprecated) Use triton moe grouped gemm kernel.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -2049,28 +2146,42 @@ class ServerArgs:
         )
 
         if self.enable_lora:
-            # Normalize lora_paths to a dictionary if it is a list.
-            # TODO (lifuhuang): support specifying pinned adapters in server_args.
             if isinstance(self.lora_paths, list):
                 lora_paths = self.lora_paths
-                self.lora_paths =
+                self.lora_paths = []
                 for lora_path in lora_paths:
-                    if
-
-
-
+                    if isinstance(lora_path, str):
+                        if "=" in lora_path:
+                            name, path = lora_path.split("=", 1)
+                            lora_ref = LoRARef(
+                                lora_name=name, lora_path=path, pinned=False
+                            )
+                        else:
+                            lora_ref = LoRARef(
+                                lora_name=lora_path, lora_path=lora_path, pinned=False
+                            )
+                    elif isinstance(lora_path, dict):
+                        assert (
+                            "lora_name" in lora_path and "lora_path" in lora_path
+                        ), f"When providing LoRA paths as a list of dict, each dict should contain 'lora_name' and 'lora_path' keys. Got: {lora_path}"
+                        lora_ref = LoRARef(
+                            lora_name=lora_path["lora_name"],
+                            lora_path=lora_path["lora_path"],
+                            pinned=lora_path.get("pinned", False),
                         )
                     else:
-
-
+                        raise ValueError(
+                            f"Invalid type for item in --lora-paths list: {type(lora_path)}. "
+                            "Expected a string or a dictionary."
                         )
+                    self.lora_paths.append(lora_ref)
             elif isinstance(self.lora_paths, dict):
-                self.lora_paths =
-
+                self.lora_paths = [
+                    LoRARef(lora_name=k, lora_path=v, pinned=False)
                     for k, v in self.lora_paths.items()
-
+                ]
             elif self.lora_paths is None:
-                self.lora_paths =
+                self.lora_paths = []
             else:
                 raise ValueError(
                     f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
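With this change, every accepted `--lora-paths` shape (a name-to-path dict, a list of `<PATH>`/`<NAME>=<PATH>` strings, or a list of JSON-derived dicts) normalizes to a flat list of `LoRARef` objects. A standalone sketch of the normalization; the `LoRARef` below is a stand-in for `sglang.srt.lora.lora_registry.LoRARef`:

```python
from dataclasses import dataclass


@dataclass
class LoRARef:  # stand-in for sglang.srt.lora.lora_registry.LoRARef
    lora_name: str
    lora_path: str
    pinned: bool = False


def normalize(entries):
    refs = []
    for entry in entries:
        if isinstance(entry, str):
            if "=" in entry:
                name, path = entry.split("=", 1)  # <NAME>=<PATH>
                refs.append(LoRARef(name, path))
            else:
                refs.append(LoRARef(entry, entry))  # <PATH>: name defaults to the path
        elif isinstance(entry, dict):
            refs.append(
                LoRARef(entry["lora_name"], entry["lora_path"], entry.get("pinned", False))
            )
        else:
            raise ValueError(f"Expected str or dict, got {type(entry)}")
    return refs


# Adapter names and paths are made up for illustration.
print(normalize([
    "sql=/adapters/sql",
    "/adapters/default",
    {"lora_name": "math", "lora_path": "/adapters/math", "pinned": True},
]))
```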
@@ -2097,9 +2208,7 @@ class ServerArgs:
                 "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
                 f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
             )
-            assert (
-                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
-            ), (
+            assert len(self.lora_paths) <= self.max_loaded_loras, (
                 "The number of LoRA paths should not exceed max_loaded_loras. "
                 f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
             )
@@ -2117,9 +2226,9 @@ class ServerArgs:
         model_arch = hf_config.architectures[0]
         if model_arch in ["GptOssForCausalLM"]:
             if self.attention_backend is None:
-                if is_sm100_supported():
+                if is_cuda() and is_sm100_supported():
                     self.attention_backend = "trtllm_mha"
-                elif is_sm90_supported():
+                elif is_cuda() and is_sm90_supported():
                     self.attention_backend = "fa3"
                 else:
                     self.attention_backend = "triton"
@@ -2132,10 +2241,11 @@ class ServerArgs:
             ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
 
             if is_sm100_supported():
-                self.
-
-
-
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
             quantization_config = getattr(hf_config, "quantization_config", None)
             is_mxfp4_quant_format = (
                 quantization_config is not None
@@ -2143,18 +2253,21 @@ class ServerArgs:
             )
 
             if is_sm100_supported() and is_mxfp4_quant_format:
-                self.
-                self.enable_triton_kernel_moe = False
+                self.moe_runner_backend = "flashinfer_mxfp4"
                 logger.warning(
                     "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
                 )
             else:
-                if self.
+                if self.moe_runner_backend == "triton_kernel":
                     assert (
                         self.ep_size == 1
                     ), "Triton kernel MoE is only supported when ep_size == 1"
-                if
-                    self.
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
                     logger.warning(
                         "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
                     )
@@ -2163,7 +2276,10 @@ class ServerArgs:
                 # use bf16 for mxfp4 triton kernels
                 self.dtype = "bfloat16"
         elif "Llama4" in model_arch:
-            assert self.attention_backend
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+            }, "fa3 or aiter is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
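For GPT-OSS checkpoints the attention backend is now chosen through a ladder explicitly gated on `is_cuda()`, so non-CUDA devices fall through to `triton` instead of probing SM capability on hardware that has none. A condensed sketch of the selection, with the hardware predicates reduced to plain booleans:

```python
def choose_gpt_oss_attention_backend(cuda: bool, sm100: bool, sm90: bool) -> str:
    """Mirrors the ladder above: trtllm_mha on Blackwell (SM100), fa3 on
    Hopper (SM90), triton everywhere else, including non-CUDA devices."""
    if cuda and sm100:
        return "trtllm_mha"
    if cuda and sm90:
        return "fa3"
    return "triton"


assert choose_gpt_oss_attention_backend(True, True, False) == "trtllm_mha"
# The new is_cuda() guard: SM flags are ignored off CUDA.
assert choose_gpt_oss_attention_backend(False, True, False) == "triton"
```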
@@ -2317,13 +2433,22 @@ class PortArgs:
 
 class LoRAPathAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
-
-
-
-
-
-
-
+        lora_paths = []
+        if values:
+            assert isinstance(values, list), "Expected a list of LoRA paths."
+            for lora_path in values:
+                lora_path = lora_path.strip()
+                if lora_path.startswith("{") and lora_path.endswith("}"):
+                    obj = json.loads(lora_path)
+                    assert "lora_path" in obj and "lora_name" in obj, (
+                        f"{repr(lora_path)} looks like a JSON str, "
+                        "but it does not contain 'lora_name' and 'lora_path' keys."
+                    )
+                    lora_paths.append(obj)
+                else:
+                    lora_paths.append(lora_path)
+
+        setattr(namespace, self.dest, lora_paths)
 
 
 class DeprecatedAction(argparse.Action):
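`LoRAPathAction` decides per value whether it is a JSON object (wrapped in curly braces) or a plain `<PATH>`/`<NAME>=<PATH>` string; full conversion to `LoRARef` happens later in the normalization shown earlier. The JSON detection in isolation (adapter names and paths are made up):

```python
import json

values = [
    '{"lora_name": "sql", "lora_path": "/adapters/sql", "pinned": true}',
    "chat=/adapters/chat",
]

parsed = []
for value in (v.strip() for v in values):
    if value.startswith("{") and value.endswith("}"):
        obj = json.loads(value)  # the JSON form must carry both required keys
        assert "lora_name" in obj and "lora_path" in obj
        parsed.append(obj)
    else:
        parsed.append(value)  # plain <PATH> or <NAME>=<PATH> passes through as-is

print(parsed)
```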
@@ -2336,6 +2461,10 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)
 
 
+def print_deprecated_warning(message: str):
+    logger.warning(f"\033[33m{message}\033[0m")
+
+
 def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
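`print_deprecated_warning` moves from a closure inside `__post_init__` to module level so all the new deprecation branches can share it. It wraps the message in ANSI yellow so deprecation notices stand out in server logs:

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("sglang")


def print_deprecated_warning(message: str):
    # \033[33m switches the terminal to yellow; \033[0m resets the color.
    logger.warning(f"\033[33m{message}\033[0m")


print_deprecated_warning(
    "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
)
```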
@@ -2348,8 +2477,12 @@ def auto_choose_speculative_params(self: ServerArgs):
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
-    elif arch in [
-
+    elif arch in [
+        "DeepseekV3ForCausalLM",
+        "DeepseekV2ForCausalLM",
+        "GptOssForCausalLM",
+    ]:
+        # The default value for deepseek and gpt-oss
         return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
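Judging by the unpacking earlier in this file (`... self.speculative_num_draft_tokens) = auto_choose_speculative_params(self)`), the returned triple reads as `(speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens)`; treat that field order as an inference from the visible context. Under that reading, the new deepseek/gpt-oss default is consistent with the `topk == 1` constraint enforced in `__post_init__`:

```python
# Assumed field order: (num_steps, eagle_topk, num_draft_tokens).
defaults = {
    "LlamaForCausalLM": (5, 4, 8),
    "DeepseekV3ForCausalLM": (3, 1, 4),
    "GptOssForCausalLM": (3, 1, 4),
}

for arch, (num_steps, topk, num_draft_tokens) in defaults.items():
    # __post_init__ requires num_draft_tokens == num_steps + 1 when topk == 1;
    # the deepseek/gpt-oss default (3, 1, 4) satisfies it: 3 + 1 == 4.
    if topk == 1:
        assert num_draft_tokens == num_steps + 1, arch
```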
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
CHANGED
@@ -41,6 +41,7 @@ class EAGLEDraftCudaGraphRunner:
         # Parse args
         self.eagle_worker = eagle_worker
         self.model_runner = model_runner = eagle_worker.model_runner
+        self.model_runner: EAGLEWorker
         self.graphs = {}
         self.output_buffers = {}
         self.enable_torch_compile = model_runner.server_args.enable_torch_compile