sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +220 -378
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +208 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
```diff
@@ -23,6 +23,7 @@ from typing import List, Optional
 import torch
 
 from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
     get_hpu_memory_capacity,
@@ -43,19 +44,19 @@ class ServerArgs:
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    skip_tokenizer_init: bool = False
     load_format: str = "auto"
-    trust_remote_code: bool =
+    trust_remote_code: bool = False
     dtype: str = "auto"
     kv_cache_dtype: str = "auto"
-    quantization_param_path: nullable_str = None
     quantization: Optional[str] = None
+    quantization_param_path: nullable_str = None
     context_length: Optional[int] = None
     device: str = "cuda"
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
-    skip_tokenizer_init: bool = False
 
     # Port for the HTTP server
     host: str = "127.0.0.1"
@@ -67,7 +68,7 @@ class ServerArgs:
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
-    schedule_policy: str = "
+    schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
     prefill_only_one_req: bool = False
@@ -79,21 +80,25 @@ class ServerArgs:
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
     watchdog_timeout: float = 300
+    dist_timeout: Optional[int] = None  # timeout for torch.distributed
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
+    gpu_id_step: int = 1
 
     # Logging
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
+    log_requests_level: int = 0
     show_time_cost: bool = False
     enable_metrics: bool = False
     decode_log_interval: int = 40
 
     # API related
     api_key: Optional[str] = None
-
+    file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
+    reasoning_parser: Optional[str] = None
 
     # Data parallelism
     dp_size: int = 1
@@ -121,11 +126,14 @@ class ServerArgs:
     grammar_backend: Optional[str] = "outlines"
 
     # Speculative decoding
-    speculative_draft_model_path: Optional[str] = None
     speculative_algorithm: Optional[str] = None
+    speculative_draft_model_path: Optional[str] = None
     speculative_num_steps: int = 5
-
-
+    speculative_eagle_topk: int = 4
+    speculative_num_draft_tokens: int = 8
+    speculative_accept_threshold_single: float = 1.0
+    speculative_accept_threshold_acc: float = 1.0
+    speculative_token_map: Optional[str] = None
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -137,7 +145,6 @@ class ServerArgs:
 
     # Optimization/debug options
     disable_radix_cache: bool = False
-    disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_nccl_nvls: bool = False
@@ -161,14 +168,17 @@ class ServerArgs:
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
    allow_auto_truncate: bool = False
-    return_hidden_states: bool = False
-
-    # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False
-
     enable_flashinfer_mla: bool = False
+    flashinfer_mla_disable_ragged: bool = False
+    warmups: Optional[str] = None
+
+    # Debug tensor dumps
+    debug_tensor_dump_output_folder: Optional[str] = None
+    debug_tensor_dump_input_file: Optional[str] = None
+    debug_tensor_dump_inject: bool = False
 
     def __post_init__(self):
         # Set missing default values
@@ -262,18 +272,24 @@ class ServerArgs:
         )
 
         # Speculative Decoding
-        if
-
-
-
+        if self.speculative_algorithm == "NEXTN":
+            # NEXTN shares the same implementation of EAGLE
+            self.speculative_algorithm = "EAGLE"
+
+        if self.speculative_algorithm == "EAGLE":
+            self.disable_overlap_schedule = True
             self.prefill_only_one_req = True
             self.disable_cuda_graph_padding = True
-            self.
-
-            self.chunked_prefill_size = -1
+            if self.max_running_requests is None:
+                self.max_running_requests = 32
             logger.info(
-
+                "Overlap scheduler are disabled because of using "
+                "eagle speculative decoding."
+                "Max running request set to 32 because of using eagle speculative decoding."
             )
+            # The token generated from the verify step is counted.
+            # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
+            assert self.speculative_num_steps < self.speculative_num_draft_tokens
 
         # GGUF
         if (
```
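The reworked `__post_init__` above aliases NEXTN onto the EAGLE implementation and enforces that draft steps stay below the draft-token budget. A minimal standalone sketch of that control flow (restated here because constructing a real `ServerArgs` needs a model path and GPU environment; this is not sglang code):

```python
# Standalone sketch mirroring the new __post_init__ logic above.
def normalize_speculative(algorithm: str, num_steps: int, num_draft_tokens: int) -> str:
    if algorithm == "NEXTN":
        # NEXTN shares the same implementation as EAGLE
        algorithm = "EAGLE"
    if algorithm == "EAGLE":
        # The verify step consumes draft tokens, so draft steps beyond
        # the draft-token budget would always be discarded.
        assert num_steps < num_draft_tokens
    return algorithm

print(normalize_speculative("NEXTN", 5, 8))  # -> EAGLE
```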
```diff
@@ -377,15 +393,6 @@ class ServerArgs:
             choices=["auto", "fp8_e5m2", "fp8_e4m3"],
             help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )
-        parser.add_argument(
-            "--quantization-param-path",
-            type=nullable_str,
-            default=None,
-            help="Path to the JSON file containing the KV cache "
-            "scaling factors. This should generally be supplied, when "
-            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
-            "default to 1.0, which may cause accuracy issues. ",
-        )
         parser.add_argument(
             "--quantization",
             type=str,
@@ -404,6 +411,15 @@ class ServerArgs:
             ],
             help="The quantization method.",
         )
+        parser.add_argument(
+            "--quantization-param-path",
+            type=nullable_str,
+            default=None,
+            help="Path to the JSON file containing the KV cache "
+            "scaling factors. This should generally be supplied, when "
+            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
+            "default to 1.0, which may cause accuracy issues. ",
+        )
         parser.add_argument(
             "--context-length",
             type=int,
@@ -537,11 +553,17 @@ class ServerArgs:
             default=ServerArgs.watchdog_timeout,
             help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
         )
+        parser.add_argument(
+            "--dist-timeout",
+            type=int,
+            default=ServerArgs.dist_timeout,
+            help="Set timeout for torch.distributed initialization.",
+        )
         parser.add_argument(
             "--download-dir",
             type=str,
             default=ServerArgs.download_dir,
-            help="Model download directory.",
+            help="Model download directory for huggingface.",
         )
         parser.add_argument(
             "--base-gpu-id",
```
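The new `--dist-timeout` feeds a timeout into torch.distributed setup. A hedged illustration of the knob it controls, assuming sglang converts the integer seconds into the `timeout` argument of `init_process_group` (the actual call site lives in the distributed init code, not in this diff):

```python
from datetime import timedelta

import torch.distributed as dist

# Single-process example of the torch.distributed timeout that
# --dist-timeout 1800 is assumed to configure.
dist.init_process_group(
    backend="gloo",
    init_method="tcp://127.0.0.1:29500",
    rank=0,
    world_size=1,
    timeout=timedelta(seconds=1800),
)
dist.destroy_process_group()
```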
```diff
@@ -549,6 +571,12 @@ class ServerArgs:
             default=ServerArgs.base_gpu_id,
             help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
         )
+        parser.add_argument(
+            "--gpu-id-step",
+            type=int,
+            default=ServerArgs.gpu_id_step,
+            help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
+        )
 
         # Logging
         parser.add_argument(
```
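Taken together with `--base-gpu-id`, the new `--gpu-id-step` determines which devices one instance touches. A small sketch derived from the two help texts (the scheduler's actual allocation code is elsewhere in this release):

```python
# GPU ids used by one instance, per the help texts:
# start at base_gpu_id and advance by gpu_id_step.
base_gpu_id, gpu_id_step, num_gpus = 0, 2, 4
gpu_ids = [base_gpu_id + i * gpu_id_step for i in range(num_gpus)]
print(gpu_ids)  # [0, 2, 4, 6] -- "setting it to 2 will use GPU 0,2,4,..."
```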
```diff
@@ -566,7 +594,14 @@ class ServerArgs:
         parser.add_argument(
             "--log-requests",
             action="store_true",
-            help="Log
+            help="Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level",
+        )
+        parser.add_argument(
+            "--log-requests-level",
+            type=int,
+            default=0,
+            help="0: Log metadata. 1. Log metadata and partial input/output. 2. Log every input/output.",
+            choices=[0, 1, 2],
         )
         parser.add_argument(
             "--show-time-cost",
@@ -593,9 +628,9 @@ class ServerArgs:
             help="Set API key of the server. It is also used in the OpenAI API compatible server.",
         )
         parser.add_argument(
-            "--file-storage-
+            "--file-storage-path",
             type=str,
-            default=ServerArgs.
+            default=ServerArgs.file_storage_path,
             help="The path of the file storage in backend.",
         )
         parser.add_argument(
@@ -603,6 +638,13 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--reasoning-parser",
+            type=str,
+            choices=list(ReasoningParser.DetectorMap.keys()),
+            default=ServerArgs.reasoning_parser,
+            help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
+        )
 
         # Data parallelism
         parser.add_argument(
```
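Because the `--reasoning-parser` choices are built directly from `ReasoningParser.DetectorMap` (added in the new `sglang/srt/reasoning_parser.py`), the accepted values can be listed programmatically; with this release installed, the snippet below prints exactly what the CLI accepts:

```python
from sglang.srt.reasoning_parser import ReasoningParser

# The argparse definition above uses list(ReasoningParser.DetectorMap.keys())
# as its choices, so this is the authoritative set of parser names.
print(list(ReasoningParser.DetectorMap.keys()))
```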
```diff
@@ -694,7 +736,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines"],
+            choices=["xgrammar", "outlines", "llguidance"],
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
```
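With `llguidance` now a selectable backend (implemented in the new `sglang/srt/constrained/llguidance_backend.py`), any grammar-guided request exercises it. A minimal smoke test, assuming a server is already running on the default port with `--grammar-backend llguidance` and that `json_schema` in `sampling_params` is the constrained-decoding hook:

```python
import requests

# json_schema in sampling_params triggers whichever grammar backend
# the server was launched with.
resp = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "Give me a JSON object with a name field: ",
        "sampling_params": {
            "max_new_tokens": 64,
            "json_schema": '{"type": "object", "properties": {"name": {"type": "string"}}}',
        },
    },
)
print(resp.json()["text"])
```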
```diff
@@ -703,6 +745,11 @@ class ServerArgs:
             action="store_true",
             help="Enable FlashInfer MLA optimization",
         )
+        parser.add_argument(
+            "--flashinfer-mla-disable-ragged",
+            action="store_true",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -722,18 +769,36 @@ class ServerArgs:
             help="The number of steps sampled from draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_steps,
         )
+        parser.add_argument(
+            "--speculative-eagle-topk",
+            type=int,
+            help="The number of tokens sampled from the draft model in eagle2 each step.",
+            choices=[1, 2, 4, 8],
+            default=ServerArgs.speculative_eagle_topk,
+        )
         parser.add_argument(
             "--speculative-num-draft-tokens",
             type=int,
-            help="The number of
+            help="The number of tokens sampled from the draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_draft_tokens,
         )
         parser.add_argument(
-            "--speculative-
-            type=
-            help="
-
-
+            "--speculative-accept-threshold-single",
+            type=float,
+            help="Accept a draft token if its probability in the target model is greater than this threshold.",
+            default=ServerArgs.speculative_accept_threshold_single,
+        )
+        parser.add_argument(
+            "--speculative-accept-threshold-acc",
+            type=float,
+            help="The accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc).",
+            default=ServerArgs.speculative_accept_threshold_acc,
+        )
+        parser.add_argument(
+            "--speculative-token-map",
+            type=str,
+            help="The path of the draft model's small vocab table.",
+            default=ServerArgs.speculative_token_map,
         )
 
         # Double Sparsity
```
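The two new acceptance thresholds compose as their help texts describe: a draft token is accepted outright when its target-model probability exceeds `threshold_single`, and otherwise its acceptance probability is raised from p to min(1, p / threshold_acc). A sketch of that rule under the assumption that the two flags combine this way (it is not the actual verify kernel):

```python
import torch

def accept_probability(
    p_target: torch.Tensor,
    threshold_single: float = 1.0,
    threshold_acc: float = 1.0,
) -> torch.Tensor:
    # Raise the acceptance probability from p to min(1, p / threshold_acc) ...
    boosted = torch.clamp(p_target / threshold_acc, max=1.0)
    # ... and accept unconditionally above threshold_single.
    return torch.where(
        p_target > threshold_single, torch.ones_like(p_target), boosted
    )

p = torch.tensor([0.2, 0.6, 0.95])
print(accept_probability(p, threshold_single=0.9, threshold_acc=0.5))
# tensor([0.4000, 1.0000, 1.0000])
```

At the 1.0 defaults this reduces to plain acceptance with the target probability itself.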
```diff
@@ -779,11 +844,6 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
-        parser.add_argument(
-            "--disable-jump-forward",
-            action="store_true",
-            help="Disable jump-forward for grammar-guided decoding.",
-        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -913,12 +973,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--return-hidden-states",
-            action="store_true",
-            help="Return hidden states in the response.",
-        )
-
-        # Function Calling
         parser.add_argument(
             "--tool-call-parser",
             type=str,
@@ -932,6 +986,35 @@ class ServerArgs:
             help="Enable hierarchical cache",
         )
 
+        # Server warmups
+        parser.add_argument(
+            "--warmups",
+            type=str,
+            required=False,
+            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
+            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
+        )
+
+        # Debug tensor dumps
+        parser.add_argument(
+            "--debug-tensor-dump-output-folder",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_output_folder,
+            help="The output folder for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-input-file",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_input_file,
+            help="The input filename for dumping tensors",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-inject",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_inject,
+            help="Inject the outputs from jax as the input of every layer.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
```
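`--warmups` takes a csv of function names resolved against the new `sglang/srt/warmup.py`. The registration mechanics of that module are not part of this hunk, so the resolver below is only a generic illustration of the csv-to-function mapping the help text describes; the module and function names are stand-ins:

```python
import types

# Stand-in for warmup.py; the real module defines the runnable warmups.
fake_warmup_module = types.SimpleNamespace(
    warmup_name1=lambda: print("running warmup_name1"),
    warmup_name2=lambda: print("running warmup_name2"),
)

def run_warmups(csv_value: str, module) -> None:
    # e.g. --warmups=warmup_name1,warmup_name2
    for name in csv_value.split(","):
        getattr(module, name)()  # unknown names raise AttributeError

run_warmups("warmup_name1,warmup_name2", fake_warmup_module)
```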
```diff
@@ -960,6 +1043,7 @@ class ServerArgs:
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
+        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
         if isinstance(self.lora_paths, list):
             lora_paths = self.lora_paths
```
sglang/srt/speculative/build_eagle_tree.py
CHANGED
```diff
@@ -3,14 +3,8 @@
 from typing import List
 
 import torch
-
-from
-
-if is_cuda_available():
-    from sgl_kernel import build_tree_kernel as sgl_build_tree_kernel
-    from sgl_kernel import (
-        build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
-    )
+from sgl_kernel import build_tree_kernel as sgl_build_tree_kernel
+from sgl_kernel import build_tree_kernel_efficient as sgl_build_tree_kernel_efficient
 
 
 def build_tree_kernel_efficient_preprocess(
```
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
CHANGED
```diff
@@ -21,7 +21,6 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.speculative.eagle_utils import EagleDraftInput
 
 if TYPE_CHECKING:
-    from sglang.srt.model_executor.model_runner import ModelRunner
     from sglang.srt.speculative.eagle_worker import EAGLEWorker
 
 
```