sglang 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +3 -1
- sglang/api.py +7 -7
- sglang/backend/anthropic.py +1 -1
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +158 -11
- sglang/backend/runtime_endpoint.py +18 -10
- sglang/bench_latency.py +299 -0
- sglang/global_config.py +12 -2
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +114 -67
- sglang/lang/ir.py +28 -3
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +2 -1
- sglang/srt/constrained/__init__.py +13 -6
- sglang/srt/constrained/fsm_cache.py +8 -2
- sglang/srt/constrained/jump_forward.py +113 -25
- sglang/srt/conversation.py +2 -0
- sglang/srt/flush_cache.py +3 -1
- sglang/srt/hf_transformers_utils.py +130 -1
- sglang/srt/layers/extend_attention.py +17 -0
- sglang/srt/layers/fused_moe.py +582 -0
- sglang/srt/layers/logits_processor.py +65 -32
- sglang/srt/layers/radix_attention.py +41 -7
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/controller/dp_worker.py +113 -0
- sglang/srt/managers/{router → controller}/infer_batch.py +242 -100
- sglang/srt/managers/controller/manager_multi.py +191 -0
- sglang/srt/managers/{router/manager.py → controller/manager_single.py} +34 -14
- sglang/srt/managers/{router → controller}/model_runner.py +262 -158
- sglang/srt/managers/{router → controller}/radix_cache.py +11 -1
- sglang/srt/managers/{router/scheduler.py → controller/schedule_heuristic.py} +9 -7
- sglang/srt/managers/{router/model_rpc.py → controller/tp_worker.py} +298 -267
- sglang/srt/managers/detokenizer_manager.py +42 -46
- sglang/srt/managers/io_struct.py +22 -12
- sglang/srt/managers/tokenizer_manager.py +151 -87
- sglang/srt/model_config.py +83 -5
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +10 -13
- sglang/srt/models/dbrx.py +9 -15
- sglang/srt/models/gemma.py +12 -15
- sglang/srt/models/grok.py +738 -0
- sglang/srt/models/llama2.py +26 -15
- sglang/srt/models/llama_classification.py +104 -0
- sglang/srt/models/llava.py +86 -19
- sglang/srt/models/llavavid.py +11 -20
- sglang/srt/models/mixtral.py +282 -103
- sglang/srt/models/mixtral_quant.py +372 -0
- sglang/srt/models/qwen.py +9 -13
- sglang/srt/models/qwen2.py +11 -13
- sglang/srt/models/stablelm.py +9 -15
- sglang/srt/models/yivl.py +17 -22
- sglang/srt/openai_api_adapter.py +150 -95
- sglang/srt/openai_protocol.py +11 -2
- sglang/srt/server.py +124 -48
- sglang/srt/server_args.py +128 -48
- sglang/srt/utils.py +234 -67
- sglang/test/test_programs.py +65 -3
- sglang/test/test_utils.py +32 -1
- sglang/utils.py +23 -4
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/METADATA +40 -27
- sglang-0.1.18.dist-info/RECORD +78 -0
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/WHEEL +1 -1
- sglang/srt/backend_config.py +0 -13
- sglang/srt/models/dbrx_config.py +0 -281
- sglang/srt/weight_utils.py +0 -417
- sglang-0.1.16.dist-info/RECORD +0 -72
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/LICENSE +0 -0
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
```diff
@@ -2,6 +2,7 @@
 
 import argparse
 import dataclasses
+import random
 from typing import List, Optional, Union
 
 
@@ -10,11 +11,13 @@ class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
-    load_format: str = "auto"
     tokenizer_mode: str = "auto"
-
+    load_format: str = "auto"
+    dtype: str = "auto"
     trust_remote_code: bool = True
     context_length: Optional[int] = None
+    quantization: Optional[str] = None
+    chat_template: Optional[str] = None
 
     # Port
     host: str = "127.0.0.1"
@@ -23,31 +26,40 @@ class ServerArgs:
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
-
+    max_prefill_tokens: Optional[int] = None
+    max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
     schedule_conservativeness: float = 1.0
 
     # Other runtime options
     tp_size: int = 1
     stream_interval: int = 8
-    random_seed: int =
+    random_seed: Optional[int] = None
 
     # Logging
     log_level: str = "info"
+    log_level_http: Optional[str] = None
     log_requests: bool = False
-    disable_log_stats: bool = False
-    log_stats_interval: int = 10
     show_time_cost: bool = False
 
     # Other
    api_key: str = ""
 
+    # Data parallelism
+    dp_size: int = 1
+    load_balance_method: str = "round_robin"
+
     # Optimization/debug options
-
-    attention_reduce_in_fp32: bool = False
+    disable_flashinfer: bool = False
     disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
     disable_disk_cache: bool = False
+    attention_reduce_in_fp32: bool = False
+
+    # Distributed args
+    nccl_init_addr: Optional[str] = None
+    nnodes: int = 1
+    node_rank: Optional[int] = None
 
     def __post_init__(self):
         if self.tokenizer_path is None:
@@ -66,6 +78,9 @@ class ServerArgs:
         elif self.additional_ports is None:
             self.additional_ports = []
 
+        if self.random_seed is None:
+            self.random_seed = random.randint(0, 1 << 30)
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
```
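The seed change is the main behavioral shift here: `random_seed` is now `Optional` and, when left unset, `__post_init__` draws one via `random.randint(0, 1 << 30)`. A minimal sketch of the resulting behavior, assuming an installed sglang at this version (the model path is an illustrative placeholder):

```python
from sglang.srt.server_args import ServerArgs

# Illustrative model path; any local or Hugging Face model id would do.
args = ServerArgs(model_path="meta-llama/Llama-2-7b-hf")

# The new fields take the defaults declared above.
assert args.dtype == "auto" and args.quantization is None

# The previously fixed default seed is now randomized per instance.
assert args.random_seed is not None
assert 0 <= args.random_seed <= (1 << 30)  # randint is inclusive on both ends
```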
```diff
@@ -91,7 +106,16 @@ class ServerArgs:
             type=int,
             nargs="*",
             default=[],
-            help="
+            help="The additional ports specified for the server.",
+        )
+        parser.add_argument(
+            "--tokenizer-mode",
+            type=str,
+            default=ServerArgs.tokenizer_mode,
+            choices=["auto", "slow"],
+            help="Tokenizer mode. 'auto' will use the fast "
+            "tokenizer if available, and 'slow' will "
+            "always use the slow tokenizer.",
         )
         parser.add_argument(
             "--load-format",
@@ -110,20 +134,20 @@ class ServerArgs:
             "which is mainly for profiling.",
         )
         parser.add_argument(
-            "--tokenizer-mode",
-            type=str,
-            default=ServerArgs.tokenizer_mode,
-            choices=["auto", "slow"],
-            help="Tokenizer mode. 'auto' will use the fast "
-            "tokenizer if available, and 'slow' will "
-            "always use the slow tokenizer.",
-        )
-        parser.add_argument(
-            "--chat-template",
+            "--dtype",
             type=str,
-            default=ServerArgs.
-
-
+            default=ServerArgs.dtype,
+            choices=[
+                "auto", "half", "float16", "bfloat16", "float", "float32"
+            ],
+            help='Data type for model weights and activations.\n\n'
+            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+            'BF16 precision for BF16 models.\n'
+            '* "half" for FP16. Recommended for AWQ quantization.\n'
+            '* "float16" is the same as "half".\n'
+            '* "bfloat16" for a balance between precision and range.\n'
+            '* "float" is shorthand for FP32 precision.\n'
+            '* "float32" for FP32 precision.')
         parser.add_argument(
             "--trust-remote-code",
             action="store_true",
```
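As a rough reference for the new `--dtype` choices, the strings correspond to the usual torch dtypes. The mapping below is an illustration only, not sglang's actual resolution code (which happens downstream at model-load time):

```python
import torch

# Illustrative correspondence only. "auto" has no fixed entry: per the help
# text it resolves to FP16 for FP32/FP16 checkpoints and BF16 for BF16 ones.
DTYPE_ALIASES = {
    "half": torch.float16,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
    "float": torch.float32,
    "float32": torch.float32,
}
```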
```diff
@@ -135,6 +159,18 @@ class ServerArgs:
             default=ServerArgs.context_length,
             help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
         )
+        parser.add_argument(
+            "--quantization",
+            type=str,
+            default=ServerArgs.quantization,
+            help="The quantization method.",
+        )
+        parser.add_argument(
+            "--chat-template",
+            type=str,
+            default=ServerArgs.chat_template,
+            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+        )
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -142,17 +178,23 @@ class ServerArgs:
             help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
         parser.add_argument(
-            "--max-prefill-
+            "--max-prefill-tokens",
             type=int,
-            default=ServerArgs.
+            default=ServerArgs.max_prefill_tokens,
             help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
         )
+        parser.add_argument(
+            "--max-running-requests",
+            type=int,
+            default=ServerArgs.max_running_requests,
+            help="The maximum number of running requests.",
+        )
         parser.add_argument(
             "--schedule-heuristic",
             type=str,
             default=ServerArgs.schedule_heuristic,
             choices=["lpm", "random", "fcfs", "dfs-weight"],
-            help="
+            help="The scheduling heuristic.",
         )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -164,7 +206,7 @@ class ServerArgs:
             "--tp-size",
             type=int,
             default=ServerArgs.tp_size,
-            help="
+            help="The tensor parallelism size.",
         )
         parser.add_argument(
             "--stream-interval",
@@ -176,29 +218,24 @@ class ServerArgs:
             "--random-seed",
             type=int,
             default=ServerArgs.random_seed,
-            help="
+            help="The random seed.",
         )
         parser.add_argument(
             "--log-level",
             type=str,
             default=ServerArgs.log_level,
-            help="
+            help="The logging level of all loggers.",
         )
         parser.add_argument(
-            "--log-
-
-
+            "--log-level-http",
+            type=str,
+            default=ServerArgs.log_level_http,
+            help="The logging level of HTTP server. If not set, reuse --log-level by default.",
         )
         parser.add_argument(
-            "--
+            "--log-requests",
             action="store_true",
-            help="
-        )
-        parser.add_argument(
-            "--log-stats-interval",
-            type=int,
-            default=ServerArgs.log_stats_interval,
-            help="Log stats interval in second.",
+            help="Log the inputs and outputs of all requests.",
         )
         parser.add_argument(
             "--show-time-cost",
@@ -212,16 +249,47 @@ class ServerArgs:
             help="Set API key of the server",
         )
 
-        #
+        # Data parallelism
         parser.add_argument(
-            "--
-
-
+            "--dp-size",
+            type=int,
+            default=ServerArgs.dp_size,
+            help="The data parallelism size.",
         )
         parser.add_argument(
-            "--
+            "--load-balance-method",
+            type=str,
+            default=ServerArgs.load_balance_method,
+            help="The load balancing strategy for data parallelism.",
+            choices=[
+                "round_robin",
+                "shortest_queue",
+            ],
+        )
+
+        # Multi-node distributed serving args
+        parser.add_argument(
+            "--nccl-init-addr",
+            type=str,
+            help="The nccl init address of multi-node server."
+        )
+        parser.add_argument(
+            "--nnodes",
+            type=int,
+            default=1,
+            help="The number of nodes."
+        )
+        parser.add_argument(
+            "--node-rank",
+            type=int,
+            help="The node rank."
+        )
+
+        # Optimization/debug options
+        parser.add_argument(
+            "--disable-flashinfer",
             action="store_true",
-            help="
+            help="Disable flashinfer inference kernels",
         )
         parser.add_argument(
             "--disable-radix-cache",
```
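Taken together, the new data-parallel, multi-node, and debug flags flow through the existing `add_cli_args`/`from_cli_args` pair. A hedged sketch of how they would be exercised (the flag values are illustrative, and `--model-path` is assumed to be the flag registered for the required `model_path` field, which this diff does not show):

```python
import argparse
from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)

# Parse an illustrative command line exercising the new options.
args = parser.parse_args([
    "--model-path", "meta-llama/Llama-2-7b-hf",
    "--dp-size", "2",
    "--load-balance-method", "shortest_queue",
    "--nnodes", "1",
    "--disable-flashinfer",
])
server_args = ServerArgs.from_cli_args(args)
assert server_args.dp_size == 2 and server_args.disable_flashinfer
```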
```diff
@@ -238,6 +306,12 @@ class ServerArgs:
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+            "This only affects Triton attention kernels",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -249,7 +323,7 @@ class ServerArgs:
 
     def print_mode_args(self):
         return (
-            f"
+            f"disable_flashinfer={self.disable_flashinfer}, "
             f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
             f"disable_radix_cache={self.disable_radix_cache}, "
             f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
@@ -257,10 +331,16 @@ class ServerArgs:
         )
 
 
+@dataclasses.dataclass
+class ModelPortArgs:
+    nccl_port: int
+    model_tp_ips: List[str]
+    model_tp_ports: List[int]
+
+
 @dataclasses.dataclass
 class PortArgs:
     tokenizer_port: int
     router_port: int
     detokenizer_port: int
-
-    model_rpc_ports: List[int]
+    model_port_args: List[ModelPortArgs]
```