sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +302 -414
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +13 -8
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +144 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +773 -334
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +225 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +68 -37
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +102 -36
- sglang/srt/model_executor/cuda_graph_runner.py +56 -31
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +280 -81
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +135 -60
- sglang/srt/speculative/build_eagle_tree.py +8 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
- sglang/srt/speculative/eagle_utils.py +92 -57
- sglang/srt/speculative/eagle_worker.py +238 -111
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -23,6 +23,7 @@ from typing import List, Optional
 import torch
 
 from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
     get_hpu_memory_capacity,
@@ -43,19 +44,19 @@ class ServerArgs:
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    skip_tokenizer_init: bool = False
     load_format: str = "auto"
-    trust_remote_code: bool = True
+    trust_remote_code: bool = False
     dtype: str = "auto"
     kv_cache_dtype: str = "auto"
-    quantization_param_path: nullable_str = None
     quantization: Optional[str] = None
+    quantization_param_path: nullable_str = None
     context_length: Optional[int] = None
     device: str = "cuda"
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
-    skip_tokenizer_init: bool = False
 
     # Port for the HTTP server
     host: str = "127.0.0.1"
@@ -67,10 +68,9 @@ class ServerArgs:
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
-    schedule_policy: str = "lpm"
+    schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    prefill_only_one_req: bool = False
 
     # Other runtime options
     tp_size: int = 1
@@ -79,21 +79,25 @@ class ServerArgs:
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
     watchdog_timeout: float = 300
+    dist_timeout: Optional[int] = None  # timeout for torch.distributed
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
+    gpu_id_step: int = 1
 
     # Logging
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
+    log_requests_level: int = 0
     show_time_cost: bool = False
     enable_metrics: bool = False
     decode_log_interval: int = 40
 
     # API related
     api_key: Optional[str] = None
-    file_storage_pth: str = "sglang_storage"
+    file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
+    reasoning_parser: Optional[str] = None
 
     # Data parallelism
     dp_size: int = 1
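The dataclass hunk above renames `file_storage_pth` to `file_storage_path` and introduces `dist_timeout`, `gpu_id_step`, `log_requests_level`, and `reasoning_parser`. A minimal sketch of constructing the updated dataclass programmatically; the model path and values are placeholders, and `__post_init__` still fills in the remaining defaults:

```python
from sglang.srt.server_args import ServerArgs

# Placeholder values; only the field names and defaults come from the diff above.
args = ServerArgs(
    model_path="my-org/my-model",        # hypothetical model path
    dist_timeout=1800,                   # new: seconds for torch.distributed init
    gpu_id_step=2,                       # new: stride between assigned GPU IDs
    log_requests_level=1,                # new: 0/1/2 request-logging verbosity
    file_storage_path="sglang_storage",  # renamed from file_storage_pth
)
```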
@@ -121,11 +125,14 @@ class ServerArgs:
     grammar_backend: Optional[str] = "outlines"
 
     # Speculative decoding
-    speculative_draft_model_path: Optional[str] = None
     speculative_algorithm: Optional[str] = None
+    speculative_draft_model_path: Optional[str] = None
     speculative_num_steps: int = 5
-    speculative_num_draft_tokens: int = 64
-    speculative_eagle_topk: int = 8
+    speculative_eagle_topk: int = 4
+    speculative_num_draft_tokens: int = 8
+    speculative_accept_threshold_single: float = 1.0
+    speculative_accept_threshold_acc: float = 1.0
+    speculative_token_map: Optional[str] = None
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -137,7 +144,6 @@ class ServerArgs:
 
     # Optimization/debug options
     disable_radix_cache: bool = False
-    disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_nccl_nvls: bool = False
@@ -161,14 +167,17 @@ class ServerArgs:
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
-    return_hidden_states: bool = False
-
-    # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False
-
     enable_flashinfer_mla: bool = False
+    flashinfer_mla_disable_ragged: bool = False
+    warmups: Optional[str] = None
+
+    # Debug tensor dumps
+    debug_tensor_dump_output_folder: Optional[str] = None
+    debug_tensor_dump_input_file: Optional[str] = None
+    debug_tensor_dump_inject: bool = False
 
     def __post_init__(self):
         # Set missing default values
@@ -262,18 +271,22 @@ class ServerArgs:
         )
 
         # Speculative Decoding
-        if …
-        …
-        …
-        …
-        …
-            self.…
-        …
+        if self.speculative_algorithm == "NEXTN":
+            # NEXTN shares the same implementation of EAGLE
+            self.speculative_algorithm = "EAGLE"
+
+        if self.speculative_algorithm == "EAGLE":
+            if self.max_running_requests is None:
+                self.max_running_requests = 32
             self.disable_overlap_schedule = True
-            self.…
+            self.disable_cuda_graph_padding = True
             logger.info(
-                …
+                "Overlap scheduler are disabled because of using "
+                "eagle speculative decoding."
             )
+            # The token generated from the verify step is counted.
+            # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
+            # assert self.speculative_num_steps < self.speculative_num_draft_tokens
 
         # GGUF
         if (
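The three new comment lines restate a constraint worth spelling out: the verify step contributes one token of its own, so if `speculative_num_steps >= speculative_num_draft_tokens`, some drafted tokens can never be consumed. A small illustration using the new defaults from this diff:

```python
# Illustration only, using the new defaults from this diff.
speculative_num_steps = 5         # draft forward passes per verify round
speculative_num_draft_tokens = 8  # tokens verified per round, verify token included

# The deepest draft chain yields at most num_steps tokens plus the bonus token
# from the verify step, so the draft budget should exceed num_steps.
assert speculative_num_steps < speculative_num_draft_tokens
```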
@@ -377,15 +390,6 @@ class ServerArgs:
             choices=["auto", "fp8_e5m2", "fp8_e4m3"],
             help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )
-        parser.add_argument(
-            "--quantization-param-path",
-            type=nullable_str,
-            default=None,
-            help="Path to the JSON file containing the KV cache "
-            "scaling factors. This should generally be supplied, when "
-            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
-            "default to 1.0, which may cause accuracy issues. ",
-        )
         parser.add_argument(
             "--quantization",
             type=str,
@@ -404,6 +408,15 @@ class ServerArgs:
             ],
             help="The quantization method.",
         )
+        parser.add_argument(
+            "--quantization-param-path",
+            type=nullable_str,
+            default=None,
+            help="Path to the JSON file containing the KV cache "
+            "scaling factors. This should generally be supplied, when "
+            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
+            "default to 1.0, which may cause accuracy issues. ",
+        )
         parser.add_argument(
             "--context-length",
             type=int,
@@ -493,12 +506,6 @@ class ServerArgs:
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading",
         )
-        parser.add_argument(
-            "--prefill-only-one-req",
-            type=bool,
-            help="If true, we only prefill one request at one prefill batch",
-            default=ServerArgs.prefill_only_one_req,
-        )
 
         # Other runtime options
         parser.add_argument(
@@ -537,11 +544,17 @@ class ServerArgs:
             default=ServerArgs.watchdog_timeout,
             help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
         )
+        parser.add_argument(
+            "--dist-timeout",
+            type=int,
+            default=ServerArgs.dist_timeout,
+            help="Set timeout for torch.distributed initialization.",
+        )
         parser.add_argument(
             "--download-dir",
             type=str,
             default=ServerArgs.download_dir,
-            help="Model download directory.",
+            help="Model download directory for huggingface.",
         )
         parser.add_argument(
             "--base-gpu-id",
@@ -549,6 +562,12 @@ class ServerArgs:
             default=ServerArgs.base_gpu_id,
             help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
         )
+        parser.add_argument(
+            "--gpu-id-step",
+            type=int,
+            default=ServerArgs.gpu_id_step,
+            help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
+        )
 
         # Logging
         parser.add_argument(
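Taken together, `--base-gpu-id` and the new `--gpu-id-step` determine which devices a multi-instance deployment touches. A sketch of the assignment implied by the help text above (the formula is inferred from that text, not lifted from the package):

```python
def assigned_gpu_ids(base_gpu_id: int, gpu_id_step: int, num_workers: int) -> list[int]:
    # Workers are spaced gpu_id_step apart, starting at base_gpu_id.
    return [base_gpu_id + i * gpu_id_step for i in range(num_workers)]

# Matches the "setting it to 2 will use GPU 0,2,4,..." example in the help string.
assert assigned_gpu_ids(0, 2, 3) == [0, 2, 4]
```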
@@ -566,7 +585,14 @@ class ServerArgs:
         parser.add_argument(
             "--log-requests",
             action="store_true",
-            help="Log the inputs and outputs of all requests.",
+            help="Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level",
+        )
+        parser.add_argument(
+            "--log-requests-level",
+            type=int,
+            default=0,
+            help="0: Log metadata. 1. Log metadata and partial input/output. 2. Log every input/output.",
+            choices=[0, 1, 2],
         )
         parser.add_argument(
             "--show-time-cost",
@@ -593,9 +619,9 @@ class ServerArgs:
             help="Set API key of the server. It is also used in the OpenAI API compatible server.",
         )
         parser.add_argument(
-            "--file-storage-pth",
+            "--file-storage-path",
             type=str,
-            default=ServerArgs.file_storage_pth,
+            default=ServerArgs.file_storage_path,
             help="The path of the file storage in backend.",
         )
         parser.add_argument(
@@ -603,6 +629,13 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--reasoning-parser",
+            type=str,
+            choices=list(ReasoningParser.DetectorMap.keys()),
+            default=ServerArgs.reasoning_parser,
+            help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
+        )
 
         # Data parallelism
         parser.add_argument(
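The valid values for `--reasoning-parser` are whatever detector names the new `sglang/srt/reasoning_parser.py` registers in `ReasoningParser.DetectorMap`; a quick way to list them:

```python
from sglang.srt.reasoning_parser import ReasoningParser

# The --reasoning-parser choices are exactly these registered detector names.
print(list(ReasoningParser.DetectorMap.keys()))
```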
@@ -694,7 +727,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines"],
+            choices=["xgrammar", "outlines", "llguidance"],
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -703,6 +736,11 @@ class ServerArgs:
             action="store_true",
             help="Enable FlashInfer MLA optimization",
         )
+        parser.add_argument(
+            "--flashinfer-mla-disable-ragged",
+            action="store_true",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -722,18 +760,36 @@ class ServerArgs:
             help="The number of steps sampled from draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_steps,
         )
+        parser.add_argument(
+            "--speculative-eagle-topk",
+            type=int,
+            help="The number of tokens sampled from the draft model in eagle2 each step.",
+            choices=[1, 2, 4, 8],
+            default=ServerArgs.speculative_eagle_topk,
+        )
         parser.add_argument(
             "--speculative-num-draft-tokens",
             type=int,
-            help="The number of …",
+            help="The number of tokens sampled from the draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_draft_tokens,
         )
         parser.add_argument(
-            "--speculative-…",
-            type=…,
-            help="…",
-            …
-            …
+            "--speculative-accept-threshold-single",
+            type=float,
+            help="Accept a draft token if its probability in the target model is greater than this threshold.",
+            default=ServerArgs.speculative_accept_threshold_single,
+        )
+        parser.add_argument(
+            "--speculative-accept-threshold-acc",
+            type=float,
+            help="The accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc).",
+            default=ServerArgs.speculative_accept_threshold_acc,
+        )
+        parser.add_argument(
+            "--speculative-token-map",
+            type=str,
+            help="The path of the draft model's small vocab table.",
+            default=ServerArgs.speculative_token_map,
         )
 
         # Double Sparsity
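The two help strings above fully describe the acceptance rule for a draft token with target-model probability p: accept outright when p exceeds `threshold_single`, otherwise accept with probability min(1, p / threshold_acc). A sketch of one plausible combination of the two thresholds (illustrative, not the package's verification kernel):

```python
import random

def accept_draft_token(p: float, threshold_single: float = 1.0, threshold_acc: float = 1.0) -> bool:
    # Hard accept when the target model already favors the token strongly.
    if p > threshold_single:
        return True
    # Otherwise accept stochastically; threshold_acc = 1.0 reduces to the
    # standard rule of accepting with probability p.
    return random.random() < min(1.0, p / threshold_acc)
```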
@@ -779,11 +835,6 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
-        parser.add_argument(
-            "--disable-jump-forward",
-            action="store_true",
-            help="Disable jump-forward for grammar-guided decoding.",
-        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -913,12 +964,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--return-hidden-states",
-            action="store_true",
-            help="Return hidden states in the response.",
-        )
-
-        # Function Calling
         parser.add_argument(
             "--tool-call-parser",
             type=str,
@@ -932,6 +977,35 @@ class ServerArgs:
             help="Enable hierarchical cache",
         )
 
+        # Server warmups
+        parser.add_argument(
+            "--warmups",
+            type=str,
+            required=False,
+            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
+            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
+        )
+
+        # Debug tensor dumps
+        parser.add_argument(
+            "--debug-tensor-dump-output-folder",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_output_folder,
+            help="The output folder for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-input-file",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_input_file,
+            help="The input filename for dumping tensors",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-inject",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_inject,
+            help="Inject the outputs from jax as the input of every layer.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -960,6 +1034,7 @@ class ServerArgs:
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
+        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
         if isinstance(self.lora_paths, list):
             lora_paths = self.lora_paths
sglang/srt/speculative/build_eagle_tree.py
CHANGED
@@ -3,14 +3,8 @@
 from typing import List
 
 import torch
-
-from sglang.srt.utils import is_cuda_available
-
-if is_cuda_available():
-    from sgl_kernel import build_tree_kernel as sgl_build_tree_kernel
-    from sgl_kernel import (
-        build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
-    )
+from sgl_kernel import build_tree_kernel as sgl_build_tree_kernel
+from sgl_kernel import build_tree_kernel_efficient as sgl_build_tree_kernel_efficient
 
 
 def build_tree_kernel_efficient_preprocess(
@@ -32,7 +26,12 @@ def build_tree_kernel_efficient_preprocess(
 
     draft_tokens = torch.gather(ss_token_list, index=top_scores_index, dim=1)
     draft_tokens = torch.cat((verified_id.unsqueeze(1), draft_tokens), dim=1).flatten()
-    parent_list = torch.cat(parents_list[:-1], dim=1)
+
+    if len(parents_list) > 1:
+        parent_list = torch.cat(parents_list[:-1], dim=1)
+    else:
+        batch_size = parents_list[0].shape[0]
+        parent_list = torch.empty(batch_size, 0, device=parents_list[0].device)
 
     return parent_list, top_scores_index, draft_tokens
 
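The new branch guards the case where `parents_list` holds a single tensor (a one-step draft), in which `torch.cat(parents_list[:-1], dim=1)` would be called on an empty list and raise. A small shape-level demonstration with made-up sizes:

```python
import torch

# Made-up sizes: batch of 2, topk of 4 candidates from the single draft step.
parents_list = [torch.zeros(2, 4, dtype=torch.long)]

if len(parents_list) > 1:
    parent_list = torch.cat(parents_list[:-1], dim=1)
else:
    # Fall back to an empty (batch_size, 0) parent table instead of crashing.
    batch_size = parents_list[0].shape[0]
    parent_list = torch.empty(batch_size, 0, device=parents_list[0].device)

print(parent_list.shape)  # torch.Size([2, 0])
```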
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
CHANGED
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import bisect
-import time
 from typing import TYPE_CHECKING, Callable
 
 import torch
@@ -21,7 +20,6 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.speculative.eagle_utils import EagleDraftInput
 
 if TYPE_CHECKING:
-    from sglang.srt.model_executor.model_runner import ModelRunner
     from sglang.srt.speculative.eagle_worker import EAGLEWorker
 
 
@@ -163,20 +161,11 @@ class EAGLEDraftCudaGraphRunner:
 
         run_once()
 
-        torch.cuda.synchronize()
-        self.model_runner.tp_group.barrier()
-
-        torch.cuda.synchronize()
-        self.model_runner.tp_group.barrier()
-
         with torch.cuda.graph(
             graph, pool=get_global_graph_memory_pool(), stream=stream
         ):
             out = run_once()
 
-        torch.cuda.synchronize()
-        self.model_runner.tp_group.barrier()
-
         set_global_graph_memory_pool(graph.pool())
         return graph, out
 
@@ -205,7 +194,7 @@ class EAGLEDraftCudaGraphRunner:
 
         # Attention backend
         self.model_runner.draft_attn_backend.init_forward_metadata_replay_cuda_graph(
-            forward_batch
+            forward_batch, forward_batch.batch_size
         )
 
         # Replay