sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +24 -3
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +85 -54
- sglang/srt/entrypoints/openai/protocol.py +4 -1
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +36 -16
- sglang/srt/entrypoints/openai/serving_completions.py +12 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +6 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +147 -47
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +64 -40
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +158 -160
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +187 -39
- sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +259 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/hicache_storage.py +3 -23
- sglang/srt/mem_cache/hiradix_cache.py +103 -43
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +105 -46
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +493 -76
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +59 -2
- sglang/srt/model_executor/model_runner.py +356 -29
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +109 -15
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +1 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +56 -12
- sglang/srt/models/qwen3_moe.py +1 -1
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +276 -35
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +43 -4
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +34 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +11 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
- sglang/srt/disaggregation/launch_lb.py +0 -118
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED

@@ -19,10 +19,12 @@ import json
 import logging
 import os
 import random
+import socket
 import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
+from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
@@ -36,14 +38,18 @@ from sglang.srt.utils import (
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
+    json_list_type,
     nullable_str,
+    parse_connector_type,
 )
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
@@ -60,6 +66,7 @@ LOAD_FORMAT_CHOICES = [
     "bitsandbytes",
     "layered",
     "remote",
+    "remote_instance",
 ]
 
 QUANTIZATION_CHOICES = [
@@ -94,6 +101,7 @@ ATTENTION_BACKEND_CHOICES = [
     "trtllm_mla",
     "trtllm_mha",
     "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
     # AMD specific
     "aiter",
     "wave",
@@ -104,6 +112,8 @@ ATTENTION_BACKEND_CHOICES = [
 
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -122,6 +132,10 @@ def add_disagg_transfer_backend_choices(choices):
    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
 
 
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
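The new `GRAMMAR_BACKEND_CHOICES` constant and its `add_grammar_backend_choices` hook follow the same plugin pattern as the existing load-format and transfer-backend lists: external code extends the choices list before CLI parsing so a custom value passes the `--grammar-backend` check. A minimal usage sketch (the backend name is hypothetical; the hook only whitelists the choice, it does not implement a backend):

```python
# Minimal sketch, assuming sglang is importable: register an extra grammar
# backend name before argument parsing so --grammar-backend accepts it.
# "my_backend" is a hypothetical name used only for illustration.
from sglang.srt.server_args import add_grammar_backend_choices

add_grammar_backend_choices(["my_backend"])
```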
@@ -191,6 +205,8 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
+    tokenizer_metrics_custom_labels_header: str = "x-customer-labels"
+    tokenizer_metrics_allowed_customer_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
@@ -201,6 +217,8 @@ class ServerArgs:
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
     gc_warning_threshold_secs: float = 0.0
+    enable_trace: bool = False
+    oltp_traces_endpoint: str = "localhost:4317"
 
     # API related
     api_key: Optional[str] = None
@@ -217,6 +235,9 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    load_watch_interval: float = 0.1
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -249,12 +270,14 @@ class ServerArgs:
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
 
     # Expert parallelism
     ep_size: int = 1
@@ -296,6 +319,8 @@ class ServerArgs:
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -340,6 +365,7 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
@@ -351,6 +377,12 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+
+    # Dynamic batch tokenizer
+    enable_dynamic_batch_tokenizer: bool = False
+    dynamic_batch_tokenizer_batch_size: int = 32
+    dynamic_batch_tokenizer_batch_timeout: float = 0.002
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -359,7 +391,7 @@ class ServerArgs:
     debug_tensor_dump_prefill_only: bool = False
 
     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
-    disaggregation_mode: str = "null"
+    disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
@@ -367,20 +399,32 @@ class ServerArgs:
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-
+
+    # FIXME: hack to reduce ITL when decode bs is small
+    disaggregation_decode_polling_interval: int = 1
 
     # For model weight update
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
 
+    # Remote instance weight loading
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
+
     # For PD-Multiplexing
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_cutedsl_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_triton_kernel_moe: bool = False
     enable_flashinfer_mxfp4_moe: bool = False
@@ -402,6 +446,11 @@ class ServerArgs:
             print_deprecated_warning(
                 "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
             )
+        if self.enable_flashinfer_cutedsl_moe:
+            self.moe_runner_backend = "flashinfer_cutedsl"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
+            )
         if self.enable_flashinfer_cutlass_moe:
             self.moe_runner_backend = "flashinfer_cutlass"
             print_deprecated_warning(
@@ -421,6 +470,7 @@ class ServerArgs:
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
+
         if self.served_model_name is None:
             self.served_model_name = self.model_path
         if self.device is None:
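The hunks above add `--enable-flashinfer-cutedsl-moe` to the existing set of deprecated boolean MoE flags that get folded into `moe_runner_backend` during post-init. A self-contained sketch of that mapping pattern (not the actual `ServerArgs` class, which carries many more fields):

```python
# Self-contained sketch of the deprecation mapping shown above: a legacy
# boolean flag is folded into the newer enum-style option when the dataclass
# is constructed. Field names mirror the diff; everything else is simplified.
import warnings
from dataclasses import dataclass


@dataclass
class _Args:
    enable_flashinfer_cutedsl_moe: bool = False
    moe_runner_backend: str = "auto"

    def __post_init__(self):
        if self.enable_flashinfer_cutedsl_moe:
            self.moe_runner_backend = "flashinfer_cutedsl"
            warnings.warn(
                "--enable-flashinfer-cutedsl-moe is deprecated; "
                "set --moe-runner-backend to 'flashinfer_cutedsl' instead."
            )


# The legacy flag now only selects the backend:
assert _Args(enable_flashinfer_cutedsl_moe=True).moe_runner_backend == "flashinfer_cutedsl"
```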
@@ -464,9 +514,14 @@ class ServerArgs:
             # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
             reserved_mem = 32 * 1024
 
+            # draft model and larger cuda graph buffers
             if self.speculative_algorithm is not None:
-                # draft model and larger cuda graph buffers
-                reserved_mem += 2 * 1024
+                if self.speculative_algorithm == "STANDALONE":
+                    # Standalone speculative decoding needs more memory than other speculative
+                    # decoding algorithms since the draft model is typically larger.
+                    reserved_mem += 6 * 1024
+                else:
+                    reserved_mem += 2 * 1024
             if self.enable_dp_attention:
                 reserved_mem += 4 * 1024
 
@@ -509,7 +564,8 @@ class ServerArgs:
             self.sampling_backend = "pytorch"
 
         # Model-specific adjustments
-        self.model_specific_adjustments()
+        if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
+            self.model_specific_adjustments()
 
         # Set kernel backends
         if self.device == "cpu":
@@ -528,7 +584,7 @@ class ServerArgs:
             )
             self.disable_cuda_graph = True
 
-        if self.attention_backend == "ascend":
+        if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
@@ -608,12 +664,13 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+            self.enable_dp_lm_head = False
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -636,11 +693,13 @@ class ServerArgs:
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
         if self.moe_runner_backend == "flashinfer_trtllm":
-
-            self.
-
-
-
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )
 
         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
@@ -690,12 +749,24 @@ class ServerArgs:
             self.hicache_io_backend = "kernel"
             self.hicache_mem_layout = "page_first"
 
+        if self.hicache_mem_layout == "page_first_direct":
+            if self.hicache_io_backend != "direct":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "Page first direct layout only support direct io backend"
+                )
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -711,7 +782,12 @@ class ServerArgs:
             )
 
         model_arch = self.get_hf_config().architectures[0]
-        if model_arch in [
+        if model_arch in [
+            "DeepseekV3ForCausalLM",
+            "Glm4MoeForCausalLM",
+            "BailingMoeForCausalLM",
+            "BailingMoeV2ForCausalLM",
+        ]:
             # Auto set draft_model_path DeepSeek-V3/R1
             if self.speculative_draft_model_path is None:
                 self.speculative_draft_model_path = self.model_path
@@ -770,12 +846,19 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
-        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
         if self.custom_weight_loader is None:
             self.custom_weight_loader = []
 
+        if self.load_format == "remote_instance":
+            if (
+                self.remote_instance_weight_loader_seed_instance_ip is None
+                or self.remote_instance_weight_loader_seed_instance_service_port is None
+                or self.remote_instance_weight_loader_send_weights_group_ports is None
+            ):
+                self.load_format = "auto"
+
         # PD disaggregation
         if self.disaggregation_mode == "decode":
             assert (
@@ -787,6 +870,13 @@ class ServerArgs:
 
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -799,10 +889,19 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
+        # Validation: prevent both tokenizer batching features from being enabled
+        if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
+            raise ValueError(
+                "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
+                "Please choose one tokenizer batching approach."
+            )
+
         # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
         # Set env var before grammar backends init
        os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
@@ -814,6 +913,14 @@ class ServerArgs:
             "and cannot be used at the same time. Please use only one of them."
         )
 
+        if (
+            not self.tokenizer_metrics_custom_labels_header
+            and self.tokenizer_metrics_allowed_customer_labels
+        ):
+            raise ValueError(
+                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-customer-labels."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
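As a worked example of the reservation logic in the first hunk above (units follow the code, i.e. MiB; the 32 GiB baseline is the B200/MI300 case with chunked_prefill_size 16k and cuda_graph_max_bs 512):

```python
# Reproduces the arithmetic added in the hunk at old lines 464-472: STANDALONE
# reserves 6 GiB for the (typically larger) draft model instead of the 2 GiB
# used by the other speculative algorithms, plus 4 GiB under DP attention.
reserved_mem = 32 * 1024  # baseline, MiB
speculative_algorithm = "STANDALONE"
enable_dp_attention = True

if speculative_algorithm is not None:
    reserved_mem += 6 * 1024 if speculative_algorithm == "STANDALONE" else 2 * 1024
if enable_dp_attention:
    reserved_mem += 4 * 1024

print(reserved_mem)  # 43008 MiB = 42 GiB reserved in this configuration
```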
@@ -825,16 +932,28 @@ class ServerArgs:
             required=True,
         )
         parser.add_argument(
-            "--tokenizer-path",
+            "--remote-instance-weight-loader-seed-instance-ip",
             type=str,
-            default=ServerArgs.tokenizer_path,
-            help="The path of the tokenizer.",
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
+            help="The ip of the seed instance for loading weights from remote instance.",
         )
         parser.add_argument(
-            "--tokenizer-worker-num",
+            "--remote-instance-weight-loader-seed-instance-service-port",
             type=int,
-            default=ServerArgs.tokenizer_worker_num,
-            help="The worker num of the tokenizer manager.",
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
+            help="The service port of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-send-weights-group-ports",
+            type=json_list_type,
+            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
+            help="The communication group ports for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--tokenizer-path",
+            type=str,
+            default=ServerArgs.tokenizer_path,
+            help="The path of the tokenizer.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -845,6 +964,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -1035,7 +1160,7 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
             help="The scheduling policy of the requests.",
         )
         parser.add_argument(
@@ -1209,6 +1334,21 @@ class ServerArgs:
             "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
             "otherwise all metrics appear to come from TP 0.",
         )
+        parser.add_argument(
+            "--tokenizer-metrics-custom-labels-header",
+            type=str,
+            default=ServerArgs.tokenizer_metrics_custom_labels_header,
+            help="Specify the HTTP header for passing customer labels for tokenizer metrics.",
+        )
+        parser.add_argument(
+            "--tokenizer-metrics-allowed-customer-labels",
+            type=str,
+            nargs="+",
+            default=ServerArgs.tokenizer_metrics_allowed_customer_labels,
+            help="The customer labels allowed for tokenizer metrics. The labels are specified via a dict in "
+            "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
+            "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
+        )
         parser.add_argument(
             "--bucket-time-to-first-token",
             type=float,
@@ -1280,6 +1420,17 @@ class ServerArgs:
             default=None,
             help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
         )
+        parser.add_argument(
+            "--enable-trace",
+            action="store_true",
+            help="Enable opentelemetry trace",
+        )
+        parser.add_argument(
+            "--oltp-traces-endpoint",
+            type=str,
+            default="localhost:4317",
+            help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
+        )
 
         # API related
         parser.add_argument(
@@ -1364,6 +1515,18 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--load-watch-interval",
+            type=float,
+            default=ServerArgs.load_watch_interval,
+            help="The interval of load watching in seconds.",
+        )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )
 
         # Multi-node distributed serving
         parser.add_argument(
@@ -1474,7 +1637,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance", "none"],
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -1490,14 +1653,23 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
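Per the help text above, custom tokenizer-metric labels travel as a JSON dict in the header named by `--tokenizer-metrics-custom-labels-header` (default `x-customer-labels`), restricted to the allow-list given by `--tokenizer-metrics-allowed-customer-labels`. A hypothetical request sketch; the endpoint and payload shape are assumptions, not taken from this diff:

```python
import json

import requests  # assumed to be available in the client environment

# Assumes a server launched with:
#   --tokenizer-metrics-allowed-customer-labels label1 label2
# Labels outside the allow-list would not be attached to the metrics.
resp = requests.post(
    "http://localhost:30000/generate",  # assumed default SGLang endpoint
    headers={"x-customer-labels": json.dumps({"label1": "value1"})},
    json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
)
print(resp.status_code)
```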
@@ -1534,6 +1706,13 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1561,6 +1740,7 @@ class ServerArgs:
                 "flashinfer_trtllm",
                 "flashinfer_cutlass",
                 "flashinfer_mxfp4",
+                "flashinfer_cutedsl",
             ],
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
@@ -1568,7 +1748,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1661,6 +1841,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
 
+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1696,7 +1891,7 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-mem-layout",
             type=str,
-            choices=["layer_first", "page_first"],
+            choices=["layer_first", "page_first", "page_first_direct"],
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
@@ -1720,6 +1915,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend_extra_config,
             help="A dictionary in JSON string format containing extra configuration for the storage backend.",
         )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )
 
         # Double Sparsity
         parser.add_argument(
@@ -1933,6 +2134,12 @@ class ServerArgs:
             default=ServerArgs.triton_attention_num_kv_splits,
             help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
         )
+        parser.add_argument(
+            "--triton-attention-split-tile-size",
+            type=int,
+            default=ServerArgs.triton_attention_split_tile_size,
+            help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
+        )
         parser.add_argument(
             "--num-continuous-decode-steps",
             type=int,
@@ -1992,6 +2199,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )
 
         # Debug tensor dumps
         parser.add_argument(
@@ -2017,12 +2230,29 @@ class ServerArgs:
             action="store_true",
             help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
         )
+        parser.add_argument(
+            "--enable-dynamic-batch-tokenizer",
+            action="store_true",
+            help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-size",
+            type=int,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_size,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-timeout",
+            type=float,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
+        )
 
         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
             type=str,
-            default="null",
+            default=ServerArgs.disaggregation_mode,
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
@@ -2072,10 +2302,10 @@ class ServerArgs:
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
         parser.add_argument(
-            "--
-            type=
-            default=
-            help="The
+            "--disaggregation-decode-polling-interval",
+            type=int,
+            default=ServerArgs.disaggregation_decode_polling_interval,
+            help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
         )
 
         # Custom weight loader
@@ -2122,6 +2352,11 @@ class ServerArgs:
             action="store_true",
             help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
+        parser.add_argument(
+            "--enable-flashinfer-cutedsl-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CuteDSL MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
@@ -2144,6 +2379,7 @@ class ServerArgs:
         args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
+
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         return cls(**{attr: getattr(args, attr) for attr in attrs})
 
@@ -2200,7 +2436,8 @@ class ServerArgs:
 
         # Check chunked prefill
         # Skip validation if chunked prefill is disabled (i.e., size <= 0).
-        if self.chunked_prefill_size > 0:
+        # Skip validation if disaggregation mode is decode.
+        if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
             assert (
                 self.chunked_prefill_size % self.page_size == 0
             ), "chunked_prefill_size must be divisible by page_size"
@@ -2612,7 +2849,9 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
@@ -2620,6 +2859,8 @@ def auto_choose_speculative_params(self: ServerArgs):
         "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
         "GptOssForCausalLM",
+        "BailingMoeForCausalLM",
+        "BailingMoeV2ForCausalLM",
     ]:
         # The default value for deepseek and gpt-oss
         return (3, 1, 4)