sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +59 -2
- sglang/api.py +40 -11
- sglang/backend/anthropic.py +17 -3
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +160 -12
- sglang/backend/runtime_endpoint.py +62 -27
- sglang/backend/vertexai.py +1 -0
- sglang/bench_latency.py +320 -0
- sglang/global_config.py +24 -3
- sglang/lang/chat_template.py +122 -6
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +206 -98
- sglang/lang/ir.py +98 -34
- sglang/lang/tracer.py +6 -4
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +32 -0
- sglang/srt/constrained/__init__.py +14 -6
- sglang/srt/constrained/fsm_cache.py +9 -2
- sglang/srt/constrained/jump_forward.py +113 -24
- sglang/srt/conversation.py +4 -2
- sglang/srt/flush_cache.py +18 -0
- sglang/srt/hf_transformers_utils.py +144 -3
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +20 -1
- sglang/srt/layers/fused_moe.py +596 -0
- sglang/srt/layers/logits_processor.py +190 -61
- sglang/srt/layers/radix_attention.py +62 -53
- sglang/srt/layers/token_attention.py +21 -9
- sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
- sglang/srt/managers/controller/dp_worker.py +113 -0
- sglang/srt/managers/controller/infer_batch.py +908 -0
- sglang/srt/managers/controller/manager_multi.py +195 -0
- sglang/srt/managers/controller/manager_single.py +177 -0
- sglang/srt/managers/controller/model_runner.py +359 -0
- sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
- sglang/srt/managers/controller/schedule_heuristic.py +65 -0
- sglang/srt/managers/controller/tp_worker.py +813 -0
- sglang/srt/managers/detokenizer_manager.py +42 -40
- sglang/srt/managers/io_struct.py +44 -10
- sglang/srt/managers/tokenizer_manager.py +224 -82
- sglang/srt/memory_pool.py +52 -59
- sglang/srt/model_config.py +97 -2
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +369 -0
- sglang/srt/models/dbrx.py +406 -0
- sglang/srt/models/gemma.py +34 -38
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/grok.py +738 -0
- sglang/srt/models/llama2.py +47 -37
- sglang/srt/models/llama_classification.py +107 -0
- sglang/srt/models/llava.py +92 -27
- sglang/srt/models/llavavid.py +298 -0
- sglang/srt/models/minicpm.py +366 -0
- sglang/srt/models/mixtral.py +302 -127
- sglang/srt/models/mixtral_quant.py +372 -0
- sglang/srt/models/qwen.py +40 -35
- sglang/srt/models/qwen2.py +33 -36
- sglang/srt/models/qwen2_moe.py +473 -0
- sglang/srt/models/stablelm.py +33 -39
- sglang/srt/models/yivl.py +19 -26
- sglang/srt/openai_api_adapter.py +411 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +197 -481
- sglang/srt/server_args.py +190 -74
- sglang/srt/utils.py +460 -95
- sglang/test/test_programs.py +73 -10
- sglang/test/test_utils.py +226 -7
- sglang/utils.py +97 -27
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
- sglang-0.1.21.dist-info/RECORD +82 -0
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
- sglang/srt/backend_config.py +0 -13
- sglang/srt/managers/router/infer_batch.py +0 -503
- sglang/srt/managers/router/manager.py +0 -79
- sglang/srt/managers/router/model_rpc.py +0 -686
- sglang/srt/managers/router/model_runner.py +0 -514
- sglang/srt/managers/router/scheduler.py +0 -70
- sglang-0.1.14.dist-info/RECORD +0 -64
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -1,56 +1,90 @@
+"""The arguments of the server."""
+
 import argparse
 import dataclasses
+import random
 from typing import List, Optional, Union


 @dataclasses.dataclass
 class ServerArgs:
+    # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
+    tokenizer_mode: str = "auto"
+    load_format: str = "auto"
+    dtype: str = "auto"
+    trust_remote_code: bool = True
+    context_length: Optional[int] = None
+    quantization: Optional[str] = None
+    chat_template: Optional[str] = None
+
+    # Port
     host: str = "127.0.0.1"
     port: int = 30000
     additional_ports: Optional[Union[List[int], int]] = None
-
-
-    chat_template: Optional[str] = None
-    trust_remote_code: bool = True
+
+    # Memory and scheduling
     mem_fraction_static: Optional[float] = None
-
-
-    tp_size: int = 1
+    max_prefill_tokens: Optional[int] = None
+    max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
-    schedule_conservativeness: float =
-
-
+    schedule_conservativeness: float = 0.8
+
+    # Other runtime options
+    tp_size: int = 1
     stream_interval: int = 8
-
-
+    random_seed: Optional[int] = None
+
+    # Logging
     log_level: str = "info"
+    log_level_http: Optional[str] = None
+    log_requests: bool = False
+    show_time_cost: bool = False
+
+    # Other
+    api_key: str = ""

-    #
+    # Data parallelism
+    dp_size: int = 1
+    load_balance_method: str = "round_robin"
+
+    # Optimization/debug options
+    disable_flashinfer: bool = False
     disable_radix_cache: bool = False
-    enable_flashinfer: bool = False
     disable_regex_jump_forward: bool = False
+    disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
-
+    attention_reduce_in_fp32: bool = False
+    enable_p2p_check: bool = False
+
+    # Distributed args
+    nccl_init_addr: Optional[str] = None
+    nnodes: int = 1
+    node_rank: Optional[int] = None

     def __post_init__(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
-            if self.tp_size >=
-                self.mem_fraction_static = 0.
+            if self.tp_size >= 16:
+                self.mem_fraction_static = 0.74
+            elif self.tp_size >= 8:
+                self.mem_fraction_static = 0.78
             elif self.tp_size >= 4:
                 self.mem_fraction_static = 0.82
             elif self.tp_size >= 2:
                 self.mem_fraction_static = 0.85
             else:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.88
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
             self.additional_ports = []

+        if self.random_seed is None:
+            self.random_seed = random.randint(0, 1 << 30)
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
@@ -65,15 +99,27 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
-        parser.add_argument(
-
-
+        parser.add_argument(
+            "--host", type=str, default=ServerArgs.host, help="The host of the server."
+        )
+        parser.add_argument(
+            "--port", type=int, default=ServerArgs.port, help="The port of the server."
+        )
         parser.add_argument(
             "--additional-ports",
             type=int,
             nargs="*",
             default=[],
-            help="
+            help="The additional ports specified for the server.",
+        )
+        parser.add_argument(
+            "--tokenizer-mode",
+            type=str,
+            default=ServerArgs.tokenizer_mode,
+            choices=["auto", "slow"],
+            help="Tokenizer mode. 'auto' will use the fast "
+            "tokenizer if available, and 'slow' will "
+            "always use the slow tokenizer.",
         )
         parser.add_argument(
             "--load-format",
@@ -92,25 +138,42 @@ class ServerArgs:
             "which is mainly for profiling.",
         )
         parser.add_argument(
-            "--
+            "--dtype",
             type=str,
-            default=ServerArgs.
-            choices=["auto", "
-            help="
-            "
-            "
-
-
-            "
-
-
-            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server",
+            default=ServerArgs.dtype,
+            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
+            help="Data type for model weights and activations.\n\n"
+            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+            "BF16 precision for BF16 models.\n"
+            '* "half" for FP16. Recommended for AWQ quantization.\n'
+            '* "float16" is the same as "half".\n'
+            '* "bfloat16" for a balance between precision and range.\n'
+            '* "float" is shorthand for FP32 precision.\n'
+            '* "float32" for FP32 precision.',
         )
         parser.add_argument(
             "--trust-remote-code",
             action="store_true",
             help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
         )
+        parser.add_argument(
+            "--context-length",
+            type=int,
+            default=ServerArgs.context_length,
+            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
+        )
+        parser.add_argument(
+            "--quantization",
+            type=str,
+            default=ServerArgs.quantization,
+            help="The quantization method.",
+        )
+        parser.add_argument(
+            "--chat-template",
+            type=str,
+            default=ServerArgs.chat_template,
+            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+        )
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -118,28 +181,23 @@ class ServerArgs:
             help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
         parser.add_argument(
-            "--max-prefill-
+            "--max-prefill-tokens",
             type=int,
-            default=ServerArgs.
+            default=ServerArgs.max_prefill_tokens,
             help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
         )
         parser.add_argument(
-            "--
-            type=int,
-            default=ServerArgs.context_length,
-            help="The model's maximum context length. Use this to reduce the context length to save memory. Defaults to None (will use the value from the model's config.json instead).",
-        )
-        parser.add_argument(
-            "--tp-size",
+            "--max-running-requests",
             type=int,
-            default=ServerArgs.
-            help="
+            default=ServerArgs.max_running_requests,
+            help="The maximum number of running requests.",
         )
         parser.add_argument(
             "--schedule-heuristic",
             type=str,
             default=ServerArgs.schedule_heuristic,
-
+            choices=["lpm", "random", "fcfs", "dfs-weight"],
+            help="The scheduling heuristic.",
         )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -148,15 +206,10 @@ class ServerArgs:
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
         parser.add_argument(
-            "--
+            "--tp-size",
             type=int,
-            default=ServerArgs.
-            help="
-        )
-        parser.add_argument(
-            "--attention-reduce-in-fp32",
-            action="store_true",
-            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16.",
+            default=ServerArgs.tp_size,
+            help="The tensor parallelism size.",
         )
         parser.add_argument(
             "--stream-interval",
@@ -164,49 +217,106 @@ class ServerArgs:
             default=ServerArgs.stream_interval,
             help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
         )
+        parser.add_argument(
+            "--random-seed",
+            type=int,
+            default=ServerArgs.random_seed,
+            help="The random seed.",
+        )
         parser.add_argument(
             "--log-level",
             type=str,
             default=ServerArgs.log_level,
-            help="
+            help="The logging level of all loggers.",
         )
         parser.add_argument(
-            "--
+            "--log-level-http",
+            type=str,
+            default=ServerArgs.log_level_http,
+            help="The logging level of HTTP server. If not set, reuse --log-level by default.",
+        )
+        parser.add_argument(
+            "--log-requests",
             action="store_true",
-            help="
+            help="Log the inputs and outputs of all requests.",
+        )
+        parser.add_argument(
+            "--show-time-cost",
+            action="store_true",
+            help="Show time cost of custom marks",
+        )
+        parser.add_argument(
+            "--api-key",
+            type=str,
+            default=ServerArgs.api_key,
+            help="Set API key of the server",
         )
+
+        # Data parallelism
         parser.add_argument(
-            "--
+            "--dp-size",
             type=int,
-            default=ServerArgs.
-            help="
+            default=ServerArgs.dp_size,
+            help="The data parallelism size.",
         )
-        # optional modes
         parser.add_argument(
-            "--
+            "--load-balance-method",
+            type=str,
+            default=ServerArgs.load_balance_method,
+            help="The load balancing strategy for data parallelism.",
+            choices=[
+                "round_robin",
+                "shortest_queue",
+            ],
+        )
+
+        # Multi-node distributed serving args
+        parser.add_argument(
+            "--nccl-init-addr",
+            type=str,
+            help="The nccl init address of multi-node server.",
+        )
+        parser.add_argument(
+            "--nnodes", type=int, default=1, help="The number of nodes."
+        )
+        parser.add_argument("--node-rank", type=int, help="The node rank.")
+
+        # Optimization/debug options
+        parser.add_argument(
+            "--disable-flashinfer",
             action="store_true",
-            help="Disable
+            help="Disable flashinfer inference kernels",
         )
         parser.add_argument(
-            "--
+            "--disable-radix-cache",
             action="store_true",
-            help="
+            help="Disable RadixAttention",
         )
         parser.add_argument(
             "--disable-regex-jump-forward",
             action="store_true",
             help="Disable regex jump-forward",
         )
+        parser.add_argument(
+            "--disable-cuda-graph",
+            action="store_true",
+            help="Disable cuda graph.",
+        )
         parser.add_argument(
             "--disable-disk-cache",
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
         parser.add_argument(
-            "--
-
-
-
+            "--attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+            "This only affects Triton attention kernels",
+        )
+        parser.add_argument(
+            "--enable-p2p-check",
+            action="store_true",
+            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
         )

     @classmethod
@@ -217,20 +327,26 @@ class ServerArgs:
     def url(self):
         return f"http://{self.host}:{self.port}"

-    def
+    def print_mode_args(self):
         return (
+            f"disable_flashinfer={self.disable_flashinfer}, "
+            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
             f"disable_radix_cache={self.disable_radix_cache}, "
-            f"enable_flashinfer={self.enable_flashinfer}, "
             f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
             f"disable_disk_cache={self.disable_disk_cache}, "
-            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}"
         )


+@dataclasses.dataclass
+class ModelPortArgs:
+    nccl_port: int
+    model_tp_ips: List[str]
+    model_tp_ports: List[int]
+
+
 @dataclasses.dataclass
 class PortArgs:
     tokenizer_port: int
     router_port: int
     detokenizer_port: int
-
-    model_rpc_ports: List[int]
+    model_port_args: List[ModelPortArgs]