sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -7
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +25 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -2
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +29 -4
- sglang/srt/entrypoints/http_server.py +76 -0
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +23 -6
- sglang/srt/entrypoints/openai/serving_completions.py +10 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +14 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
- sglang/srt/layers/attention/triton_backend.py +109 -73
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
- sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +58 -10
- sglang/srt/layers/dp_attention.py +137 -27
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +16 -18
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +18 -46
- sglang/srt/layers/quantization/awq.py +22 -23
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +17 -21
- sglang/srt/layers/quantization/marlin_utils.py +26 -8
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +217 -98
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +222 -39
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +77 -2
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/layers.py +6 -2
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +80 -19
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +23 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +22 -48
- sglang/srt/managers/scheduler.py +28 -20
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +88 -39
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +10 -157
- sglang/srt/mem_cache/allocator_ascend.py +147 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +33 -33
- sglang/srt/model_executor/forward_batch_info.py +11 -10
- sglang/srt/model_executor/model_runner.py +93 -78
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +5 -2
- sglang/srt/models/deepseek_v2.py +226 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +27 -65
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +41 -76
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama.py +10 -2
- sglang/srt/models/llama4.py +18 -7
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +23 -23
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +84 -0
- sglang/srt/models/qwen3_moe.py +27 -43
- sglang/srt/models/step3_vl.py +8 -3
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +22 -2
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +264 -105
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +20 -19
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
- sglang/srt/layers/quantization/fp4.py +0 -557
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -23,8 +23,9 @@ import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm100_supported
+from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -33,10 +34,12 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
     is_remote_url,
+    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -82,7 +85,6 @@ class ServerArgs:
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
-    cpu_offload_gb: int = 0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
@@ -120,10 +122,12 @@ class ServerArgs:
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
+    gc_warning_threshold_secs: float = 0.0
 
     # API related
     api_key: Optional[str] = None
     served_model_name: Optional[str] = None
+    weight_version: str = "default"
     chat_template: Optional[str] = None
     completion_template: Optional[str] = None
     file_storage_path: str = "sglang_storage"
@@ -149,7 +153,9 @@ class ServerArgs:
     enable_lora: Optional[bool] = None
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
-    lora_paths: Optional[…
+    lora_paths: Optional[
+        Union[dict[str, str], List[dict[str, str]], List[str], List[LoRARef]]
+    ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
@@ -174,9 +180,16 @@ class ServerArgs:
 
     # Expert parallelism
     ep_size: int = 1
-    moe_a2a_backend: …
-    …
-    …
+    moe_a2a_backend: Literal["none", "deepep"] = "none"
+    moe_runner_backend: Literal[
+        "auto",
+        "triton",
+        "triton_kernel",
+        "flashinfer_trtllm",
+        "flashinfer_cutlass",
+        "flashinfer_mxfp4",
+    ] = "auto"
+    flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
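The MoE knobs are now typed `Literal` fields rather than loose booleans. The diff repeats the same value list again in `add_cli_args` below; as a side note, a `Literal` annotation can also be introspected to build the argparse `choices`, as in this small sketch (the `Args` dataclass here is a trimmed stand-in, not sglang code):

import typing
from dataclasses import dataclass

@dataclass
class Args:  # trimmed stand-in, not ServerArgs itself
    moe_runner_backend: typing.Literal[
        "auto",
        "triton",
        "triton_kernel",
        "flashinfer_trtllm",
        "flashinfer_cutlass",
        "flashinfer_mxfp4",
    ] = "auto"

# get_args() unpacks the Literal values, so the CLI choices cannot drift
# from the dataclass annotation.
choices = list(typing.get_args(Args.__annotations__["moe_runner_backend"]))
print(choices[:3])  # ['auto', 'triton', 'triton_kernel']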
@@ -212,6 +225,13 @@ class ServerArgs:
     ds_heavy_channel_type: str = "qk"
     ds_sparse_decode_threshold: int = 4096
 
+    # Offloading
+    cpu_offload_gb: int = 0
+    offload_group_size: int = -1
+    offload_num_in_group: int = 1
+    offload_prefetch_step: int = 1
+    offload_mode: str = "cpu"
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
@@ -222,6 +242,7 @@ class ServerArgs:
     enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
     enable_symm_mem: bool = False
+    disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -249,8 +270,6 @@ class ServerArgs:
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
-    enable_triton_kernel_moe: bool = False
-    enable_flashinfer_mxfp4_moe: bool = False
     scheduler_recv_interval: int = 1
 
     # Debug tensor dumps
@@ -281,12 +300,13 @@ class ServerArgs:
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
+    enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
 
     def __post_init__(self):
         # Check deprecated arguments
-        def print_deprecated_warning(message: str):
-            logger.warning(f"\033[33m{message}\033[0m")
-
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             print_deprecated_warning(
@@ -297,6 +317,26 @@ class ServerArgs:
             print_deprecated_warning(
                 "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
             )
+        if self.enable_triton_kernel_moe:
+            self.moe_runner_backend = "triton_kernel"
+            print_deprecated_warning(
+                "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
+            )
+        if self.enable_flashinfer_cutlass_moe:
+            self.moe_runner_backend = "flashinfer_cutlass"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
+            )
+        if self.enable_flashinfer_trtllm_moe:
+            self.moe_runner_backend = "flashinfer_trtllm"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
+            )
+        if self.enable_flashinfer_mxfp4_moe:
+            self.moe_runner_backend = "flashinfer_mxfp4"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
+            )
 
         # Set missing default values
         if self.tokenizer_path is None:
@@ -447,11 +487,6 @@ class ServerArgs:
             )
             self.page_size = 64
 
-        if self.speculative_algorithm is not None:
-            raise ValueError(
-                "trtllm_mla backend does not support speculative decoding yet."
-            )
-
         if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
             raise ValueError(
                 "TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
@@ -473,11 +508,6 @@ class ServerArgs:
             )
             self.page_size = 64
 
-        if self.speculative_algorithm is not None:
-            raise ValueError(
-                "trtllm_mha backend does not support speculative decoding yet."
-            )
-
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
                 "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
@@ -516,17 +546,16 @@ class ServerArgs:
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # MoE kernel
-        if self.enable_flashinfer_cutlass_moe:
+        if self.moe_runner_backend == "flashinfer_cutlass":
             assert (
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
             assert self.ep_size in [
                 1,
                 self.tp_size,
             ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
-        if self.enable_flashinfer_trtllm_moe:
+        if self.moe_runner_backend == "flashinfer_trtllm":
             if not self.disable_shared_experts_fusion:
                 self.disable_shared_experts_fusion = True
                 logger.warning(
@@ -555,7 +584,7 @@ class ServerArgs:
             self.ep_dispatch_algorithm = "static"
 
         if self.enable_eplb:
-            assert self.ep_size > 1
+            assert self.ep_size > 1
@@ -575,6 +604,7 @@ class ServerArgs:
                 "Pipeline parallelism is incompatible with overlap schedule."
             )
 
+        # Hicache
         if self.hicache_storage_backend == "mooncake":
             # to use mooncake storage backend, the following conditions must be met:
             self.hicache_io_backend = "kernel"
@@ -609,6 +639,10 @@ class ServerArgs:
                 logger.warning(
                     "DeepSeek MTP does not require setting speculative_draft_model_path."
                 )
+            if self.page_size != 1 and self.attention_backend == "flashinfer":
+                raise ValueError(
+                    "Speculative decoding with page_size != 1 is not supported. Please set page_size to 1."
+                )
 
         # Auto choose parameters
         if self.speculative_num_steps is None:
@@ -622,6 +656,16 @@ class ServerArgs:
                 self.speculative_num_draft_tokens,
             ) = auto_choose_speculative_params(self)
 
+        if (
+            self.attention_backend == "trtllm_mha"
+            or self.decode_attention_backend == "trtllm_mha"
+            or self.prefill_attention_backend == "trtllm_mha"
+        ):
+            if self.speculative_eagle_topk > 1:
+                raise ValueError(
+                    "trtllm_mha backend only supports topk = 1 for speculative decoding."
+                )
+
         if (
             self.speculative_eagle_topk == 1
             and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
@@ -679,6 +723,12 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )
 
+        if self.enable_hierarchical_cache and self.disable_radix_cache:
+            raise ValueError(
+                "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
+                "and cannot be used at the same time. Please use only one of them."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -932,12 +982,6 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
-        parser.add_argument(
-            "--cpu-offload-gb",
-            type=int,
-            default=ServerArgs.cpu_offload_gb,
-            help="How many GBs of RAM to reserve for CPU offloading.",
-        )
         parser.add_argument(
             "--page-size",
             type=int,
@@ -1130,6 +1174,12 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        parser.add_argument(
+            "--gc-warning-threshold-secs",
+            type=float,
+            default=ServerArgs.gc_warning_threshold_secs,
+            help="The threshold for long GC warning. If a GC takes longer than this, a warning will be logged. Set to 0 to disable.",
+        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
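The new `--gc-warning-threshold-secs` flag only surfaces here as a CLI option; the code that actually measures GC pauses is not part of this file's hunks. A plausible wiring using Python's standard `gc.callbacks` hook, purely as a sketch of the mechanism and not sglang's implementation:

import gc
import logging
import time

logger = logging.getLogger(__name__)
_gc_start = 0.0

def install_gc_warning(threshold_secs: float) -> None:
    """Warn whenever a garbage-collection pass runs longer than threshold_secs."""
    if threshold_secs <= 0:  # 0 disables the warning, matching the help text above
        return

    def _cb(phase: str, info: dict) -> None:
        global _gc_start
        if phase == "start":
            _gc_start = time.monotonic()
        elif phase == "stop":
            elapsed = time.monotonic() - _gc_start
            if elapsed > threshold_secs:
                logger.warning(
                    "gc took %.3fs (generation %d)", elapsed, info["generation"]
                )

    gc.callbacks.append(_cb)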
@@ -1162,6 +1212,12 @@ class ServerArgs:
             default=ServerArgs.served_model_name,
             help="Override the model name returned by the v1/models endpoint in OpenAI API server.",
         )
+        parser.add_argument(
+            "--weight-version",
+            type=str,
+            default=ServerArgs.weight_version,
+            help="Version identifier for the model weights. Defaults to 'default' if not specified.",
+        )
         parser.add_argument(
             "--chat-template",
             type=str,
@@ -1192,23 +1248,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        tool_call_parser_choices = list(FunctionCallParser.ToolCallParserEnum.keys())
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[
-                "qwen25",
-                "mistral",
-                "llama3",
-                "deepseekv3",
-                "pythonic",
-                "kimi_k2",
-                "qwen3_coder",
-                "glm45",
-                "step3",
-                "gpt-oss",
-            ],
+            choices=tool_call_parser_choices,
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: …
+            help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
         parser.add_argument(
             "--tool-server",
@@ -1293,7 +1339,7 @@ class ServerArgs:
             nargs="*",
             default=None,
             action=LoRAPathAction,
-            help=…
+            help='The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {"lora_name":str,"lora_path":str,"pinned":bool}',
         )
         parser.add_argument(
             "--max-loras-per-batch",
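In the `--tool-call-parser` hunk above, the hard-coded name list is replaced by reading `FunctionCallParser.ToolCallParserEnum`, so newly registered tool-call parsers appear in `--help` automatically. The same pattern in isolation (the registry contents below are placeholders, not the real parser set):

import argparse

# Stand-in registry; in sglang this is FunctionCallParser.ToolCallParserEnum.
TOOL_CALL_PARSERS = {"llama3": object, "mistral": object, "pythonic": object}

tool_call_parser_choices = list(TOOL_CALL_PARSERS.keys())
parser = argparse.ArgumentParser()
parser.add_argument(
    "--tool-call-parser",
    type=str,
    choices=tool_call_parser_choices,
    default=None,
    help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
)
print(parser.parse_args(["--tool-call-parser", "llama3"]).tool_call_parser)  # llama3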
@@ -1316,19 +1362,23 @@ class ServerArgs:
 
         # Kernel backend
         ATTN_BACKENDS = [
-            "aiter",
+            # Common
+            "triton",
+            "torch_native",
+            # NVIDIA specific
             "cutlass_mla",
             "fa3",
             "flashinfer",
             "flashmla",
-            "intel_amx",
-            "torch_native",
-            "ascend",
-            "triton",
             "trtllm_mla",
             "trtllm_mha",
             "dual_chunk_flash_attn",
+            # AMD specific
+            "aiter",
             "wave",
+            # Other platforms
+            "intel_amx",
+            "ascend",
         ]
         parser.add_argument(
             "--attention-backend",
@@ -1434,19 +1484,30 @@ class ServerArgs:
         parser.add_argument(
             "--moe-a2a-backend",
             type=str,
-            choices=["deepep"],
+            choices=["none", "deepep"],
             default=ServerArgs.moe_a2a_backend,
             help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
-            "--…
-            …
-            …
+            "--moe-runner-backend",
+            type=str,
+            choices=[
+                "auto",
+                "triton",
+                "triton_kernel",
+                "flashinfer_trtllm",
+                "flashinfer_cutlass",
+                "flashinfer_mxfp4",
+            ],
+            default=ServerArgs.moe_runner_backend,
+            help="Choose the runner backend for MoE.",
         )
         parser.add_argument(
-            "--…
-            …
-            …
+            "--flashinfer-mxfp4-moe-precision",
+            type=str,
+            choices=["mxfp4", "bf16"],
+            default=ServerArgs.flashinfer_mxfp4_moe_precision,
+            help="Choose the computation precision of flashinfer mxfp4 moe",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
@@ -1622,6 +1683,38 @@ class ServerArgs:
             help="The type of heavy channels in double sparsity attention",
         )
 
+        # Offloading
+        parser.add_argument(
+            "--cpu-offload-gb",
+            type=int,
+            default=ServerArgs.cpu_offload_gb,
+            help="How many GBs of RAM to reserve for CPU offloading.",
+        )
+        parser.add_argument(
+            "--offload-group-size",
+            type=int,
+            default=ServerArgs.offload_group_size,
+            help="Number of layers per group in offloading.",
+        )
+        parser.add_argument(
+            "--offload-num-in-group",
+            type=int,
+            default=ServerArgs.offload_num_in_group,
+            help="Number of layers to be offloaded within a group.",
+        )
+        parser.add_argument(
+            "--offload-prefetch-step",
+            type=int,
+            default=ServerArgs.offload_prefetch_step,
+            help="Steps to prefetch in offloading.",
+        )
+        parser.add_argument(
+            "--offload-mode",
+            type=str,
+            default=ServerArgs.offload_mode,
+            help="Mode of offloading.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-radix-cache",
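The relocated `--cpu-offload-gb` now sits alongside four new knobs consumed by the new `sglang/srt/offloader.py` listed above. The help strings suggest a grouped scheme: every `offload_group_size` layers form a group, `offload_num_in_group` layers of each group are offloaded, and prefetch runs `offload_prefetch_step` steps ahead. A sketch of that reading (an interpretation of the help text only, not the offloader's actual logic):

def offloaded_layer_ids(num_layers: int, group_size: int, num_in_group: int) -> list:
    """Pick the first num_in_group layers of every group_size-layer group."""
    if group_size <= 0:  # default -1: grouping disabled
        return []
    return [
        layer
        for start in range(0, num_layers, group_size)
        for layer in range(start, min(start + num_in_group, num_layers))
    ]

print(offloaded_layer_ids(num_layers=8, group_size=4, num_in_group=1))  # [0, 4]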
@@ -1670,6 +1763,11 @@ class ServerArgs:
             action="store_true",
             help="Enable NCCL symmetric memory for fast collectives.",
         )
+        parser.add_argument(
+            "--disable-flashinfer-cutlass-moe-fp4-allgather",
+            action="store_true",
+            help="Disables quantize before all-gather for flashinfer cutlass moe.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
@@ -1813,16 +1911,6 @@ class ServerArgs:
             action="store_true",
             help="Enable returning hidden states with responses.",
         )
-        parser.add_argument(
-            "--enable-triton-kernel-moe",
-            action="store_true",
-            help="Use triton moe grouped gemm kernel.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-mxfp4-moe",
-            action="store_true",
-            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
-        )
         parser.add_argument(
             "--scheduler-recv-interval",
             type=int,
@@ -1923,24 +2011,25 @@ class ServerArgs:
             default=None,
             help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
         )
+        parser.add_argument(
+            "--weight-loader-disable-mmap",
+            action="store_true",
+            help="Disable mmap while loading weight using safetensors.",
+        )
+
+        # For PD-Multiplexing
         parser.add_argument(
             "--enable-pdmux",
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
 
-        # For PD-Multiplexing
         parser.add_argument(
             "--sm-group-num",
             type=int,
             default=ServerArgs.sm_group_num,
             help="Number of sm partition groups.",
         )
-        parser.add_argument(
-            "--weight-loader-disable-mmap",
-            action="store_true",
-            help="Disable mmap while loading weight using safetensors.",
-        )
 
         # Deprecated arguments
         parser.add_argument(
@@ -1953,6 +2042,26 @@ class ServerArgs:
             action="store_true",
             help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-cutlass-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-triton-kernel-moe",
+            action="store_true",
+            help="(Deprecated) Use triton moe grouped gemm kernel.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -2037,28 +2146,42 @@ class ServerArgs:
         )
 
         if self.enable_lora:
-            # Normalize lora_paths to a dictionary if it is a list.
-            # TODO (lifuhuang): support specifying pinned adapters in server_args.
             if isinstance(self.lora_paths, list):
                 lora_paths = self.lora_paths
-                self.lora_paths = …
+                self.lora_paths = []
                 for lora_path in lora_paths:
-                    if …
-                    …
-                    …
-                    …
+                    if isinstance(lora_path, str):
+                        if "=" in lora_path:
+                            name, path = lora_path.split("=", 1)
+                            lora_ref = LoRARef(
+                                lora_name=name, lora_path=path, pinned=False
+                            )
+                        else:
+                            lora_ref = LoRARef(
+                                lora_name=lora_path, lora_path=lora_path, pinned=False
+                            )
+                    elif isinstance(lora_path, dict):
+                        assert (
+                            "lora_name" in lora_path and "lora_path" in lora_path
+                        ), f"When providing LoRA paths as a list of dict, each dict should contain 'lora_name' and 'lora_path' keys. Got: {lora_path}"
+                        lora_ref = LoRARef(
+                            lora_name=lora_path["lora_name"],
+                            lora_path=lora_path["lora_path"],
+                            pinned=lora_path.get("pinned", False),
                         )
                     else:
-                        …
-                        …
+                        raise ValueError(
+                            f"Invalid type for item in --lora-paths list: {type(lora_path)}. "
+                            "Expected a string or a dictionary."
                         )
+                    self.lora_paths.append(lora_ref)
             elif isinstance(self.lora_paths, dict):
-                self.lora_paths = …
-                …
+                self.lora_paths = [
+                    LoRARef(lora_name=k, lora_path=v, pinned=False)
                     for k, v in self.lora_paths.items()
-                    …
+                ]
             elif self.lora_paths is None:
-                self.lora_paths = …
+                self.lora_paths = []
             else:
                 raise ValueError(
                     f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
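After this normalization, every accepted `--lora-paths` shape (`<PATH>`, `<NAME>=<PATH>`, or dict) becomes a list of `LoRARef`, with `pinned` now expressible per adapter. A standalone restatement of the per-item rule, using a minimal `LoRARef` stand-in for `sglang.srt.lora.lora_registry.LoRARef`:

from dataclasses import dataclass

@dataclass
class LoRARef:  # minimal stand-in for sglang.srt.lora.lora_registry.LoRARef
    lora_name: str
    lora_path: str
    pinned: bool = False

def to_lora_ref(item) -> LoRARef:
    """Mirror of the per-item normalization in the hunk above."""
    if isinstance(item, str):
        if "=" in item:
            name, path = item.split("=", 1)
            return LoRARef(lora_name=name, lora_path=path)
        return LoRARef(lora_name=item, lora_path=item)
    if isinstance(item, dict):
        return LoRARef(item["lora_name"], item["lora_path"], item.get("pinned", False))
    raise ValueError(f"Invalid type for item in --lora-paths list: {type(item)}")

print(to_lora_ref("adapter_a=/ckpts/a"))
print(to_lora_ref({"lora_name": "b", "lora_path": "/ckpts/b", "pinned": True}))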
@@ -2085,9 +2208,7 @@ class ServerArgs:
                 "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
                 f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
             )
-            assert (
-                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
-            ), (
+            assert len(self.lora_paths) <= self.max_loaded_loras, (
                 "The number of LoRA paths should not exceed max_loaded_loras. "
                 f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
             )
|
|
2105
2226
|
model_arch = hf_config.architectures[0]
|
2106
2227
|
if model_arch in ["GptOssForCausalLM"]:
|
2107
2228
|
if self.attention_backend is None:
|
2108
|
-
|
2229
|
+
if is_cuda() and is_sm100_supported():
|
2230
|
+
self.attention_backend = "trtllm_mha"
|
2231
|
+
elif is_cuda() and is_sm90_supported():
|
2232
|
+
self.attention_backend = "fa3"
|
2233
|
+
else:
|
2234
|
+
self.attention_backend = "triton"
|
2109
2235
|
supported_backends = ["triton", "trtllm_mha", "fa3"]
|
2236
|
+
logger.info(
|
2237
|
+
f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
|
2238
|
+
)
|
2110
2239
|
assert (
|
2111
2240
|
self.attention_backend in supported_backends
|
2112
2241
|
), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
|
2242
|
+
|
2243
|
+
if is_sm100_supported():
|
2244
|
+
if not self.enable_dp_attention:
|
2245
|
+
self.enable_flashinfer_allreduce_fusion = True
|
2246
|
+
logger.info(
|
2247
|
+
"Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
|
2248
|
+
)
|
2113
2249
|
quantization_config = getattr(hf_config, "quantization_config", None)
|
2114
2250
|
is_mxfp4_quant_format = (
|
2115
2251
|
quantization_config is not None
|
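For GPT-OSS the attention backend is now auto-selected by compute capability. The ladder reduces to a small pure function (a sketch; `is_cuda`, `is_sm90_supported`, and `is_sm100_supported` are the helpers imported at the top of this diff, passed in here as booleans):

def pick_gpt_oss_attention_backend(cuda: bool, sm90: bool, sm100: bool) -> str:
    """Same selection ladder as the hunk above, with capability checks passed in."""
    if cuda and sm100:
        return "trtllm_mha"
    if cuda and sm90:
        return "fa3"
    return "triton"

assert pick_gpt_oss_attention_backend(True, True, True) == "trtllm_mha"
assert pick_gpt_oss_attention_backend(True, True, False) == "fa3"
assert pick_gpt_oss_attention_backend(False, False, False) == "triton"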
@@ -2117,18 +2253,21 @@ class ServerArgs:
             )
 
             if is_sm100_supported() and is_mxfp4_quant_format:
-                self.enable_flashinfer_mxfp4_moe = True
-                self.enable_triton_kernel_moe = False
+                self.moe_runner_backend = "flashinfer_mxfp4"
                 logger.warning(
                     "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
                 )
             else:
-                if self.enable_triton_kernel_moe:
+                if self.moe_runner_backend == "triton_kernel":
                     assert (
                         self.ep_size == 1
                     ), "Triton kernel MoE is only supported when ep_size == 1"
-                if …
-                    self.…
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
                     logger.warning(
                         "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
                     )
@@ -2137,7 +2276,10 @@ class ServerArgs:
                     # use bf16 for mxfp4 triton kernels
                     self.dtype = "bfloat16"
         elif "Llama4" in model_arch:
-            assert self.attention_backend …
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+            }, "fa3 or aiter is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2291,13 +2433,22 @@ class PortArgs:
 
 class LoRAPathAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        lora_paths = []
+        if values:
+            assert isinstance(values, list), "Expected a list of LoRA paths."
+            for lora_path in values:
+                lora_path = lora_path.strip()
+                if lora_path.startswith("{") and lora_path.endswith("}"):
+                    obj = json.loads(lora_path)
+                    assert "lora_path" in obj and "lora_name" in obj, (
+                        f"{repr(lora_path)} looks like a JSON str, "
+                        "but it does not contain 'lora_name' and 'lora_path' keys."
+                    )
+                    lora_paths.append(obj)
+                else:
+                    lora_paths.append(lora_path)
+
+        setattr(namespace, self.dest, lora_paths)
 
 
 class DeprecatedAction(argparse.Action):
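`LoRAPathAction` now accepts inline JSON objects alongside the `<PATH>` and `<NAME>=<PATH>` forms, deferring `LoRARef` construction to `__post_init__`. A condensed, runnable version showing what the action leaves in the namespace:

import argparse
import json

class LoRAPathAction(argparse.Action):  # condensed form of the class above
    def __call__(self, parser, namespace, values, option_string=None):
        out = []
        for v in values or []:
            v = v.strip()
            # JSON entries are parsed to dicts; plain entries pass through as strings.
            out.append(json.loads(v) if v.startswith("{") and v.endswith("}") else v)
        setattr(namespace, self.dest, out)

p = argparse.ArgumentParser()
p.add_argument("--lora-paths", nargs="*", default=None, action=LoRAPathAction)
ns = p.parse_args(
    ["--lora-paths", "a=/ckpts/a", '{"lora_name": "b", "lora_path": "/ckpts/b", "pinned": true}']
)
print(ns.lora_paths)  # ['a=/ckpts/a', {'lora_name': 'b', 'lora_path': '/ckpts/b', 'pinned': True}]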
@@ -2310,6 +2461,10 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)
 
 
+def print_deprecated_warning(message: str):
+    logger.warning(f"\033[33m{message}\033[0m")
+
+
 def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
@@ -2322,8 +2477,12 @@ def auto_choose_speculative_params(self: ServerArgs):
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
-    elif arch in […
-        …
+    elif arch in [
+        "DeepseekV3ForCausalLM",
+        "DeepseekV2ForCausalLM",
+        "GptOssForCausalLM",
+    ]:
+        # The default value for deepseek and gpt-oss
         return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
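Restated as a table, the per-architecture defaults returned here are (num_steps, eagle_topk, num_draft_tokens); that field order is inferred from the unpacking shown in the `@@ -622,6 +656,16` hunk above, so treat it as a reading of the diff rather than documented API:

# arch -> (speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens)
SPEC_DEFAULTS = {
    "LlamaForCausalLM": (5, 4, 8),
    "DeepseekV3ForCausalLM": (3, 1, 4),
    "DeepseekV2ForCausalLM": (3, 1, 4),
    "GptOssForCausalLM": (3, 1, 4),  # GPT-OSS newly shares the deepseek defaults
    "Grok1ForCausalLM": (5, 4, 8),
    "Grok1VForCausalLM": (5, 4, 8),
}

def spec_defaults(arch: str) -> tuple:
    """Table-driven restatement of auto_choose_speculative_params for the architectures shown above."""
    return SPEC_DEFAULTS[arch]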