sglang 0.3.5.post1__py3-none-any.whl → 0.3.5.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +309 -0
- sglang/bench_serving.py +44 -30
- sglang/srt/constrained/base_grammar_backend.py +4 -3
- sglang/srt/constrained/outlines_backend.py +24 -24
- sglang/srt/constrained/xgrammar_backend.py +40 -4
- sglang/srt/layers/fused_moe/patch.py +4 -2
- sglang/srt/managers/detokenizer_manager.py +0 -14
- sglang/srt/managers/scheduler.py +6 -2
- sglang/srt/model_executor/model_runner.py +4 -1
- sglang/srt/openai_api/adapter.py +5 -2
- sglang/srt/openai_api/protocol.py +29 -26
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server.py +2 -1
- sglang/srt/server_args.py +24 -3
- sglang/srt/utils.py +33 -0
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/METADATA +2 -2
- {sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/RECORD +22 -21
- {sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/top_level.txt +0 -0
sglang/bench_offline_throughput.py
ADDED
@@ -0,0 +1,309 @@
+"""
+Benchmark the throughput of using the offline LLM engine.
+This script does not launch a server.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
+
+# Usage
+## Sharegpt dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+
+## Random dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
+
+## Shared prefix dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
+
+## Sharegpt dataset on runtime backend
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+"""
+
+import argparse
+import dataclasses
+import json
+import logging
+import random
+import time
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+from sglang.api import Engine
+from sglang.bench_serving import (
+    get_dataset,
+    get_tokenizer,
+    sample_random_requests,
+    set_ulimit,
+)
+from sglang.srt.server import Runtime
+from sglang.srt.server_args import ServerArgs
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    backend: str = "engine"
+    result_filename: str = ""
+    dataset_name: str = "sharegpt"
+    dataset_path: str = ""
+    num_prompts: int = 1000
+    sharegpt_output_len: Optional[int] = None
+    random_input_len: int = 1024
+    random_output_len: int = 1024
+    random_range_ratio: float = 0.0
+    gen_num_groups: int = 64
+    gen_prompts_per_group: int = 16
+    gen_system_prompt_len: int = 2048
+    gen_question_len: int = 128
+    gen_output_len: int = 256
+    disable_ignore_eos: bool = False
+    seed: int = 1
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--backend", type=str, default=BenchArgs.backend)
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+        parser.add_argument(
+            "--dataset-name",
+            type=str,
+            default="sharegpt",
+            choices=["sharegpt", "random", "generated-shared-prefix"],
+            help="Name of the dataset to benchmark on.",
+        )
+        parser.add_argument(
+            "--dataset-path", type=str, default="", help="Path to the dataset."
+        )
+        parser.add_argument(
+            "--num-prompts",
+            type=int,
+            default=BenchArgs.num_prompts,
+            help="Number of prompts to process. Default is 1000.",
+        )
+        parser.add_argument(
+            "--sharegpt-output-len",
+            type=int,
+            default=BenchArgs.sharegpt_output_len,
+            help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
+        )
+        parser.add_argument(
+            "--random-input-len",
+            type=int,
+            default=BenchArgs.random_input_len,
+            help="Number of input tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-output-len",
+            type=int,
+            default=BenchArgs.random_output_len,
+            help="Number of output tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-range-ratio",
+            type=float,
+            default=BenchArgs.random_range_ratio,
+            help="Range of sampled ratio of input/output length, "
+            "used only for random dataset.",
+        )
+        parser.add_argument(
+            "--gen-num-groups",
+            type=int,
+            default=BenchArgs.gen_num_groups,
+            help="Number of groups with shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-prompts-per-group",
+            type=int,
+            default=BenchArgs.gen_prompts_per_group,
+            help="Number of prompts per group of shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-system-prompt-len",
+            type=int,
+            default=BenchArgs.gen_system_prompt_len,
+            help="System prompt length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-question-len",
+            type=int,
+            default=BenchArgs.gen_question_len,
+            help="Question length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-output-len",
+            type=int,
+            default=BenchArgs.gen_output_len,
+            help="Target length in tokens for outputs in generated-shared-prefix dataset",
+        )
+        parser.add_argument(
+            "--disable-ignore-eos",
+            type=bool,
+            default=BenchArgs.disable_ignore_eos,
+            help="Disable ignore EOS token",
+        )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+def throughput_test_once(
+    backend_name: str,
+    backend,
+    reqs: List[Tuple[str, int, int]],
+    ignore_eos: bool,
+):
+    measurement_results = {
+        "backend": backend_name,
+        "successful_requests": len(reqs),
+        "total_latency": -1,
+        "total_input_tokens": sum(r[1] for r in reqs),
+        "total_output_tokens": -1,
+        "request_throughput": -1,
+        "input_throughput": -1,
+        "output_throughput": -1,
+        "total_throughput": -1,
+    }
+
+    prompt = [r[0] for r in reqs]
+    sampling_params = [
+        {
+            "temperature": 0,
+            "max_new_tokens": r[2],
+            "ignore_eos": ignore_eos,
+        }
+        for r in reqs
+    ]
+
+    st = time.perf_counter()
+    gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    latency = time.perf_counter() - st
+
+    if backend_name == "runtime":
+        gen_out = json.loads(gen_out)
+
+    measurement_results["total_latency"] = latency
+    measurement_results["total_output_tokens"] = sum(
+        o["meta_info"]["completion_tokens"] for o in gen_out
+    )
+    measurement_results["request_throughput"] = (
+        measurement_results["successful_requests"] / latency
+    )
+    measurement_results["input_throughput"] = (
+        measurement_results["total_input_tokens"] / latency
+    )
+    measurement_results["output_throughput"] = (
+        measurement_results["total_output_tokens"] / latency
+    )
+    measurement_results["total_throughput"] = (
+        measurement_results["total_input_tokens"]
+        + measurement_results["total_output_tokens"]
+    ) / latency
+
+    return measurement_results
+
+
+def throughput_test(
+    server_args: ServerArgs,
+    bench_args: BenchArgs,
+):
+    if bench_args.backend == "engine":
+        backend = Engine(**dataclasses.asdict(server_args))
+        if not backend:
+            raise ValueError("Please provide valid engine arguments")
+    elif bench_args.backend == "runtime":
+        backend = Runtime(**dataclasses.asdict(server_args))
+    else:
+        raise ValueError('Please set backend to either "engine" or "runtime"')
+
+    tokenizer_id = server_args.model_path
+    tokenizer = get_tokenizer(tokenizer_id)
+
+    # Set global environments
+    set_ulimit()
+    random.seed(bench_args.seed)
+    np.random.seed(bench_args.seed)
+
+    # Read dataset
+    input_requests = get_dataset(bench_args, tokenizer)
+
+    warmup_requests = sample_random_requests(
+        input_len=20,
+        output_len=4,
+        num_prompts=2,
+        range_ratio=0.8,
+        tokenizer=tokenizer,
+        dataset_path=bench_args.dataset_path,
+    )
+
+    # Warm up
+    throughput_test_once(
+        backend_name=bench_args.backend,
+        backend=backend,
+        reqs=warmup_requests,
+        ignore_eos=not bench_args.disable_ignore_eos,
+    )
+
+    result = throughput_test_once(
+        backend_name=bench_args.backend,
+        backend=backend,
+        reqs=input_requests,
+        ignore_eos=not bench_args.disable_ignore_eos,
+    )
+
+    if bench_args.result_filename:
+        with open(bench_args.result_filename, "a") as fout:
+            fout.write(json.dumps(result) + "\n")
+
+    print(
+        "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=")
+    )
+    print("{:<40} {:<10}".format("Backend:", result["backend"]))
+    print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"]))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"]))
+    print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
+    print(
+        "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", result["request_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", result["input_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", result["output_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", result["total_throughput"]
+        )
+    )
+    print("=" * 50)
+
+    return result
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format="%(message)s",
+    )
+
+    throughput_test(server_args, bench_args)
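Beyond the CLI usage shown in the docstring, the new module can also be driven programmatically, since throughput_test() takes a ServerArgs and a BenchArgs and returns the same metrics dict it prints. A minimal sketch, not part of the package diff; the model path and argument values are illustrative assumptions:

from sglang.bench_offline_throughput import BenchArgs, throughput_test
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")  # assumed model
bench_args = BenchArgs(backend="engine", dataset_name="random", num_prompts=100)

result = throughput_test(server_args, bench_args)
print(result["output_throughput"])  # tok/s, as computed in throughput_test_once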
sglang/bench_serving.py
CHANGED
@@ -421,6 +421,37 @@ def get_tokenizer(
     )
 
 
+def get_dataset(args, tokenizer):
+    if args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            dataset_path=args.dataset_path,
+        )
+    elif args.dataset_name == "generated-shared-prefix":
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gen_num_groups,
+            prompts_per_group=args.gen_prompts_per_group,
+            system_prompt_len=args.gen_system_prompt_len,
+            question_len=args.gen_question_len,
+            output_len=args.gen_output_len,
+            tokenizer=tokenizer,
+        )
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    return input_requests
+
+
 ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_sglang_generate,
     "sglang-native": async_request_sglang_generate,
@@ -443,6 +474,8 @@ class BenchmarkMetrics:
     input_throughput: float
     output_throughput: float
     output_throughput_retokenized: float
+    total_throughput: float
+    total_throughput_retokenized: float
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
@@ -590,7 +623,6 @@ def sample_random_requests(
         (data["conversations"][0]["value"], data["conversations"][1]["value"])
         for data in dataset
     ]
-
     # Shuffle the dataset.
     random.shuffle(dataset)
 
@@ -764,6 +796,9 @@ def calculate_metrics(
         input_throughput=total_input / dur_s,
         output_throughput=sum(output_lens) / dur_s,
         output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+        total_throughput=(total_input + sum(output_lens)) / dur_s,
+        total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+        / dur_s,
         mean_ttft_ms=np.mean(ttfts or 0)
         * 1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -881,6 +916,11 @@ async def benchmark(
             "Output token throughput (tok/s):", metrics.output_throughput
         )
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", metrics.total_throughput
+        )
+    )
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1098,35 +1138,7 @@ def run_benchmark(args_: argparse.Namespace):
 
     tokenizer = get_tokenizer(tokenizer_id)
 
-    if args.dataset_name == "sharegpt":
-        assert args.random_input_len is None and args.random_output_len is None
-        input_requests = sample_sharegpt_requests(
-            dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            fixed_output_len=args.sharegpt_output_len,
-        )
-    elif args.dataset_name == "random":
-        assert args.random_input_len is not None and args.random_output_len is not None
-        input_requests = sample_random_requests(
-            input_len=args.random_input_len,
-            output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
-            range_ratio=args.random_range_ratio,
-            tokenizer=tokenizer,
-            dataset_path=args.dataset_path,
-        )
-    elif args.dataset_name == "generated-shared-prefix":
-        input_requests = sample_generated_shared_prefix_requests(
-            num_groups=args.gen_num_groups,
-            prompts_per_group=args.gen_prompts_per_group,
-            system_prompt_len=args.gen_system_prompt_len,
-            question_len=args.gen_question_len,
-            output_len=args.gen_output_len,
-            tokenizer=tokenizer,
-        )
-    else:
-        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    input_requests = get_dataset(args, tokenizer)
 
     if not args.multi:
         return asyncio.run(
@@ -1229,10 +1241,12 @@ if __name__ == "__main__":
     parser.add_argument(
         "--random-input-len",
         type=int,
+        default=1024,
        help="Number of input tokens per request, used only for random dataset.",
    )
    parser.add_argument(
        "--random-output-len",
+        default=1024,
        type=int,
        help="Number of output tokens per request, used only for random dataset.",
    )
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-"""The baseclass of
+"""The baseclass of a backend for grammar-guided constrained decoding."""
 
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
@@ -52,7 +52,7 @@ class BaseGrammarBackend:
         else:
             entry.value = self.init_value_impl(key)
             entry.event.set()
-        return entry.value.copy()
+        return entry.value.copy() if entry.value else None
 
     def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
         raise NotImplementedError()
@@ -62,7 +62,8 @@ class BaseGrammarBackend:
         entry = self.cache.get(key)
         if not entry or not entry.event.is_set():
             return None
-
+        val = self.cache[key].value
+        return val.copy() if val else None
 
     def get_future_value(self, key: Tuple[str, str]) -> Future:
         return self.executor.submit(self.init_value, key)
sglang/srt/constrained/outlines_backend.py
CHANGED
@@ -19,9 +19,12 @@ import json
 import logging
 from typing import Dict, List, Optional, Tuple, Union
 
+import interegular
 import torch
 from outlines.fsm.guide import RegexGuide
+from outlines.fsm.json_schema import build_regex_from_schema
 from outlines.models.transformers import TransformerTokenizer
+from pydantic import BaseModel
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
@@ -32,26 +35,6 @@ from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
 logger = logging.getLogger(__name__)
 
 
-try:
-    from outlines.fsm.json_schema import build_regex_from_object
-except ImportError:
-    # Since outlines 0.0.32, build_regex_from_object is replaced by build_regex_from_schema,
-    # which only accepts string schema as input.
-    from outlines.fsm.json_schema import build_regex_from_schema
-    from pydantic import BaseModel
-
-    def build_regex_from_object(
-        object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
-    ):
-        if isinstance(object, type(BaseModel)):
-            schema = json.dumps(object.model_json_schema())
-        elif isinstance(object, Dict):
-            schema = json.dumps(object)
-        else:
-            schema = object
-        return build_regex_from_schema(schema, whitespace_pattern)
-
-
 class OutlinesGrammar(BaseGrammarObject):
     def __init__(
         self,
@@ -147,19 +130,36 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                     key_string,
                     whitespace_pattern=self.whitespace_pattern,
                 )
-            except NotImplementedError as e:
+            except (NotImplementedError, json.decoder.JSONDecodeError) as e:
                 logger.warning(
-                    f"
+                    f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                 )
-                return None
+                return None
         elif key_type == "regex":
             regex = key_string
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
-        guide = RegexGuide(regex, self.outlines_tokenizer)
+        try:
+            guide = RegexGuide(regex, self.outlines_tokenizer)
+        except interegular.patterns.InvalidSyntax as e:
+            logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
+            return None
+
         if self.allow_jump_forward:
             jump_forward_map = OutlinesJumpForwardMap(regex)
         else:
             jump_forward_map = None
         return OutlinesGrammar(guide, jump_forward_map)
+
+
+def build_regex_from_object(
+    object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
+):
+    if isinstance(object, type(BaseModel)):
+        schema = json.dumps(object.model_json_schema())
+    elif isinstance(object, Dict):
+        schema = json.dumps(object)
+    else:
+        schema = object
+    return build_regex_from_schema(schema, whitespace_pattern)
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -15,16 +15,29 @@ limitations under the License.
 
 """Constrained decoding with xgrammar backend."""
 
+import logging
 from typing import List, Tuple
 
 import torch
-
+
+try:
+    from xgrammar import CachedGrammarCompiler, CompiledGrammar, GrammarMatcher
+
+    import_error = None
+except ImportError as e:
+    CachedGrammarCompiler = CompiledGrammar = GrammarMatcher = TokenizerInfo = (
+        ImportError
+    )
+    import_error = e
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
     BaseGrammarObject,
 )
 
+logger = logging.getLogger(__name__)
+
+
 MAX_ROLLBACK_TOKENS = 10
 
 
@@ -91,15 +104,37 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         vocab_size: int,
     ):
         super().__init__()
+
+        if import_error:
+            logger.warning(
+                f"Ignore import error for the grammar backend: {import_error}"
+            )
+            self.grammar_cache = None
+            return
+
         self.grammar_cache = CachedGrammarCompiler(tokenizer_or_vocab=tokenizer)
         self.vocab_size = vocab_size
 
     def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
+        if import_error:
+            raise import_error
+
         key_type, key_string = key
         if key_type == "json":
-
+            try:
+                ctx = self.grammar_cache.get_compiled_grammar_for_json_schema(
+                    key_string
+                )
+            except RuntimeError as e:
+                logging.warning(
+                    f"Skip invalid json_schema: json_schema={key_string}, {e=}"
+                )
+                return None
         elif key_type == "regex":
-
+            logger.warning(
+                "regex hasn't been supported by xgrammar yet. This is skipped."
+            )
+            return None
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
@@ -111,4 +146,5 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         return XGrammarGrammar(matcher, self.vocab_size, ctx)
 
     def reset(self):
-        self.grammar_cache
+        if self.grammar_cache:
+            self.grammar_cache.clear()
sglang/srt/layers/fused_moe/patch.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Callable, Optional
 
 import torch
 from torch.nn import functional as F
@@ -98,7 +98,9 @@ def fused_moe_forward_native(
     renormalize: bool,
     topk_group: Optional[int] = None,
     num_expert_group: Optional[int] = None,
+    custom_routing_function: Optional[Callable] = None,
 ) -> torch.Tensor:
+    assert custom_routing_function is None
     topk_weights, topk_ids = select_experts_native(
         hidden_states=x,
         router_logits=router_logits,
@@ -114,4 +116,4 @@ def fused_moe_forward_native(
     x1 = F.silu(torch.einsum("ti,taoi -> tao", x, w1_weights))
     x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
     expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
-    return torch.einsum("tai,ta -> ti", expert_outs, topk_weights)
+    return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -100,20 +100,6 @@ class DetokenizerManager:
 
             if isinstance(recv_obj, BatchEmbeddingOut):
                 # If it is embedding model, no detokenization is needed.
-                self.send_to_tokenizer.send_pyobj(
-                    BatchEmbeddingOut(
-                        rids=recv_obj.rids,
-                        embeddings=recv_obj.embeddings,
-                        meta_info=recv_obj.meta_info,
-                        finished_reason=recv_obj.finished_reason,
-                    )
-                )
-                continue
-            elif isinstance(recv_obj, UpdateWeightReqOutput):
-                # If it is a weight update request, no detokenization is needed.
-                self.send_to_tokenizer.send_pyobj(recv_obj)
-                continue
-            elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
                 self.send_to_tokenizer.send_pyobj(recv_obj)
                 continue
             else:
sglang/srt/managers/scheduler.py
CHANGED
@@ -114,6 +114,9 @@ class Scheduler:
             self.recv_from_tokenizer = get_zmq_socket(
                 context, zmq.PULL, port_args.scheduler_input_ipc_name
             )
+            self.send_to_tokenizer = get_zmq_socket(
+                context, zmq.PUSH, port_args.tokenizer_ipc_name
+            )
 
             if server_args.skip_tokenizer_init:
                 # Directly send to the tokenizer/api
@@ -127,6 +130,7 @@ class Scheduler:
             )
         else:
             self.recv_from_tokenizer = None
+            self.send_to_tokenizer = SimpleNamespace(send_pyobj=lambda x: None)
             self.send_to_detokenizer = SimpleNamespace(send_pyobj=lambda x: None)
 
         # Init tokenizer
@@ -421,7 +425,7 @@ class Scheduler:
             self.abort_request(recv_req)
         elif isinstance(recv_req, UpdateWeightReqInput):
             success, message = self.update_weights(recv_req)
-            self.
+            self.send_to_tokenizer.send_pyobj(
                 UpdateWeightReqOutput(success, message)
             )
         elif isinstance(recv_req, ProfileReq):
@@ -430,7 +434,7 @@ class Scheduler:
             else:
                 self.stop_profile()
         elif isinstance(recv_req, GetMemPoolSizeReq):
-            self.
+            self.send_to_tokenizer.send_pyobj(
                 GetMemPoolSizeReqOutput(self.max_total_num_tokens)
             )
         else:
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -233,7 +233,10 @@ class ModelRunner:
 
         # Prepare the vllm model config
         monkey_patch_vllm_dummy_weight_loader()
-        self.load_config = LoadConfig(
+        self.load_config = LoadConfig(
+            load_format=self.server_args.load_format,
+            download_dir=self.server_args.download_dir,
+        )
         self.vllm_model_config = VllmModelConfig(
             model=self.server_args.model_path,
             quantization=self.server_args.quantization,
sglang/srt/openai_api/adapter.py
CHANGED
@@ -516,8 +516,9 @@ def v1_generate_request(
                 "regex": request.regex,
                 "json_schema": request.json_schema,
                 "n": request.n,
-                "ignore_eos": request.ignore_eos,
                 "no_stop_trim": request.no_stop_trim,
+                "ignore_eos": request.ignore_eos,
+                "skip_special_tokens": request.skip_special_tokens,
             }
         )
         return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
@@ -928,7 +929,9 @@ def v1_chat_generate_request(
             "repetition_penalty": request.repetition_penalty,
             "regex": request.regex,
             "n": request.n,
+            "no_stop_trim": request.no_stop_trim,
             "ignore_eos": request.ignore_eos,
+            "skip_special_tokens": request.skip_special_tokens,
         }
         if request.response_format and request.response_format.type == "json_schema":
             sampling_params["json_schema"] = convert_json_schema_to_str(
@@ -1166,7 +1169,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         is_first = False
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
-                            delta=DeltaMessage(role="assistant"),
+                            delta=DeltaMessage(role="assistant", content=""),
                             finish_reason=(
                                 finish_reason["type"] if finish_reason else ""
                             ),
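The two request fields forwarded above (no_stop_trim, skip_special_tokens) are SRT-specific extras, so an OpenAI-compatible client has to send them in the request body rather than as standard parameters. A hedged sketch using the openai Python client; the base_url, port, and model name here are assumptions, not part of the diff:

import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")  # assumed local server
response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "List three prime numbers."}],
    extra_body={"skip_special_tokens": False, "no_stop_trim": True},  # SRT-only extras
)
print(response.choices[0].message.content)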
sglang/srt/openai_api/protocol.py
CHANGED
@@ -36,7 +36,7 @@ class ModelList(BaseModel):
     """Model list consists of model cards."""
 
     object: str = "list"
-    data: List[ModelCard] =
+    data: List[ModelCard] = Field(default_factory=list)
 
 
 class ErrorResponse(BaseModel):
@@ -143,7 +143,7 @@ class BatchResponse(BaseModel):
     expired_at: Optional[int] = None
     cancelling_at: Optional[int] = None
     cancelled_at: Optional[int] = None
-    request_counts: dict =
+    request_counts: Optional[dict] = None
     metadata: Optional[dict] = None
 
 
@@ -153,30 +153,31 @@ class CompletionRequest(BaseModel):
     model: str
     prompt: Union[List[int], List[List[int]], str, List[str]]
     best_of: Optional[int] = None
-    echo:
-    frequency_penalty:
+    echo: bool = False
+    frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: Optional[int] = None
-    max_tokens:
+    max_tokens: int = 16
     n: int = 1
-    presence_penalty:
+    presence_penalty: float = 0.0
     seed: Optional[int] = None
-    stop: Optional[Union[str, List[str]]] =
-    stream:
+    stop: Optional[Union[str, List[str]]] = None
+    stream: bool = False
     stream_options: Optional[StreamOptions] = None
     suffix: Optional[str] = None
-    temperature:
-    top_p:
+    temperature: float = 1.0
+    top_p: float = 1.0
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
-    regex: Optional[str] = None
     json_schema: Optional[str] = None
-
+    regex: Optional[str] = None
     min_tokens: int = 0
-    repetition_penalty:
-    stop_token_ids: Optional[List[int]] =
-    no_stop_trim:
+    repetition_penalty: float = 1.0
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    skip_special_tokens: bool = True
 
 
 class CompletionResponseChoice(BaseModel):
@@ -259,28 +260,30 @@ class ChatCompletionRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: List[ChatCompletionMessageParam]
     model: str
-    frequency_penalty:
+    frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
-    logprobs:
+    logprobs: bool = False
     top_logprobs: Optional[int] = None
     max_tokens: Optional[int] = None
-    n:
-    presence_penalty:
+    n: int = 1
+    presence_penalty: float = 0.0
     response_format: Optional[ResponseFormat] = None
     seed: Optional[int] = None
-    stop: Optional[Union[str, List[str]]] =
-    stream:
+    stop: Optional[Union[str, List[str]]] = None
+    stream: bool = False
     stream_options: Optional[StreamOptions] = None
-    temperature:
-    top_p:
+    temperature: float = 0.7
+    top_p: float = 1.0
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
-    min_tokens:
-    repetition_penalty:
-    stop_token_ids: Optional[List[int]] =
+    min_tokens: int = 0
+    repetition_penalty: float = 1.0
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
     ignore_eos: bool = False
+    skip_special_tokens: bool = True
 
 
 class ChatMessage(BaseModel):
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -34,13 +34,13 @@ class SamplingParams:
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repetition_penalty: float = 1.0,
-        ignore_eos: bool = False,
-        skip_special_tokens: bool = True,
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
         json_schema: Optional[str] = None,
         no_stop_trim: bool = False,
+        ignore_eos: bool = False,
+        skip_special_tokens: bool = True,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p
sglang/srt/server.py
CHANGED
@@ -139,6 +139,7 @@ async def get_model_info():
     """Get the model information."""
     result = {
         "model_path": tokenizer_manager.model_path,
+        "tokenizer_path": tokenizer_manager.server_args.tokenizer_path,
        "is_generation": tokenizer_manager.is_generation,
    }
    return result
@@ -768,7 +769,7 @@ class Engine:
         self,
         # The input prompt. It can be a single prompt or a batch of prompts.
         prompt: Optional[Union[List[str], str]] = None,
-        sampling_params: Optional[Dict] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
         # The token ids for text; one can either specify text or input_ids.
         input_ids: Optional[Union[List[List[int]], List[int]]] = None,
         return_logprob: Optional[Union[List[bool], bool]] = False,
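The widened sampling_params annotation matches how bench_offline_throughput.py calls the engine: one sampling dict per prompt in a batch. A short sketch, not from the package; the model path and prompts are illustrative assumptions:

from sglang.api import Engine

engine = Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")  # assumed model
outputs = engine.generate(
    prompt=["What is the capital of France?", "Write a haiku about GPUs."],
    sampling_params=[
        {"temperature": 0, "max_new_tokens": 16},
        {"temperature": 0.8, "max_new_tokens": 64},
    ],  # one dict per prompt, now permitted by the Union[List[Dict], Dict] signature
)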
sglang/srt/server_args.py
CHANGED
@@ -22,7 +22,12 @@ import random
 import tempfile
 from typing import List, Optional
 
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    get_gpu_memory_capacity,
+    is_flashinfer_available,
+    is_ipv6,
+    is_port_available,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -64,6 +69,7 @@ class ServerArgs:
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
     watchdog_timeout: float = 300
+    download_dir: Optional[str] = None
 
     # Logging
     log_level: str = "info"
@@ -142,6 +148,9 @@ class ServerArgs:
             # Disable chunked prefill
             self.chunked_prefill_size = None
 
+        if self.random_seed is None:
+            self.random_seed = random.randint(0, 1 << 30)
+
         # Mem fraction depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
@@ -155,8 +164,14 @@ class ServerArgs:
             else:
                 self.mem_fraction_static = 0.88
 
-
-
+        # Adjust for GPUs with small memory capacities
+        gpu_mem = get_gpu_memory_capacity()
+        if gpu_mem < 25000:
+            logger.warning(
+                "Automatically adjust --chunked-prefill-size for small GPUs."
+            )
+            self.chunked_prefill_size //= 4  # make it 2048
+            self.cuda_graph_max_bs = 4
 
         # Deprecation warnings
         if self.disable_flashinfer:
@@ -405,6 +420,12 @@ class ServerArgs:
             default=ServerArgs.watchdog_timeout,
             help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
         )
+        parser.add_argument(
+            "--download-dir",
+            type=str,
+            default=ServerArgs.download_dir,
+            help="Model download directory.",
+        )
 
         # Logging
         parser.add_argument(
sglang/srt/utils.py
CHANGED
@@ -27,6 +27,7 @@ import resource
 import shutil
 import signal
 import socket
+import subprocess
 import tempfile
 import time
 import warnings
@@ -791,3 +792,35 @@ def add_prometheus_middleware(app):
     # Workaround for 307 Redirect for /metrics
     metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
     app.routes.append(metrics_route)
+
+
+def get_gpu_memory_capacity():
+    try:
+        # Run nvidia-smi and capture the output
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"nvidia-smi error: {result.stderr.strip()}")
+
+        # Parse the output to extract memory values
+        memory_values = [
+            float(mem)
+            for mem in result.stdout.strip().split("\n")
+            if re.match(r"^\d+(\.\d+)?$", mem.strip())
+        ]
+
+        if not memory_values:
+            raise ValueError("No GPU memory values found.")
+
+        # Return the minimum memory value
+        return min(memory_values)
+
+    except FileNotFoundError:
+        raise RuntimeError(
+            "nvidia-smi not found. Ensure NVIDIA drivers are installed and accessible."
+        )
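get_gpu_memory_capacity() returns the smallest per-GPU total memory reported by nvidia-smi (in MiB, since --format=nounits strips the unit suffix), which is the value ServerArgs compares against the 25000 threshold above. A brief sketch of running the same check outside the server; it assumes an NVIDIA GPU and nvidia-smi on PATH:

from sglang.srt.utils import get_gpu_memory_capacity

if get_gpu_memory_capacity() < 25000:  # roughly a 24 GB card or smaller
    print("Small-GPU defaults (smaller chunked prefill, cuda_graph_max_bs=4) would apply")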
sglang/test/test_utils.py
CHANGED
@@ -28,8 +28,9 @@ from sglang.utils import get_exception_traceback
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
-DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
@@ -672,7 +673,7 @@ def run_and_check_memory_leak(
     if enable_mixed_chunk:
         other_args += ["--enable-mixed-chunk"]
     if enable_overlap:
-        other_args += ["--enable-overlap-
+        other_args += ["--enable-overlap-schedule"]
 
     model = DEFAULT_MODEL_NAME_FOR_TEST
     port = random.randint(4000, 5000)
@@ -739,8 +740,7 @@ def run_mmlu_test(
 
     try:
         metrics = run_eval(args)
-
-        assert metrics["score"] >= 0.65
+        assert metrics["score"] >= 0.65, f"{metrics=}"
     finally:
         pass
 
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.5.
+__version__ = "0.3.5.post2"
{sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5.
+Version: 0.3.5.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -264,7 +264,7 @@ Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: outlines
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
{sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/RECORD
CHANGED
@@ -1,14 +1,15 @@
 sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
 sglang/api.py,sha256=3I9YUJNOeCqwKymZec2JR_agjTyKIx4XoT6IGdZ4_Cs,6953
 sglang/bench_latency.py,sha256=SSqZjcCNO88ExpT94qBZ5CmuA5o0T8wMTBnxLsNMqik,18259
+sglang/bench_offline_throughput.py,sha256=xBr7gI_ZbrpXXD72Nzu1F228oNyz1jggcblZCeUWJgw,9975
 sglang/bench_server_latency.py,sha256=N1MODIzcMk74yOWmY19d36aih3ewtHOemLxoieKtdhw,5866
-sglang/bench_serving.py,sha256=
+sglang/bench_serving.py,sha256=ytef89P9bqKRaMGXAqq69SmLTlNXWyHyhEraISLKYME,47975
 sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
 sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
 sglang/launch_server.py,sha256=_XIqBcXArYtHTqilOFkYWKZBYXGCMHAxbYOST08LGj0,415
 sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
 sglang/utils.py,sha256=eCvD3fZCALr-MuyZxJL7HAeeqqpxAxf4LJrf7OiCbco,11547
-sglang/version.py,sha256=
+sglang/version.py,sha256=NlX-QUNR7ogIH-GcgzllsyHox7ItJoycFEUM_EYuhW4,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=jprS3-In2FTUoedKwZg-HYvDwU8RTIYntOlf2zoN2sU,14814
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -26,18 +27,18 @@ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bE
 sglang/srt/conversation.py,sha256=erz6wEXMcSmBlskuUhX2c-MT0EMyqyFpTem9PgastEE,21107
 sglang/srt/hf_transformers_utils.py,sha256=QbYVTnz0UdaXESPMAaq1OMzzznn95J_l08eXJuB68aU,6618
 sglang/srt/mm_utils.py,sha256=ml68nWUJhs_FS2FU1oB9UPHKZmF7P2DQHl1ddywn4ao,12272
-sglang/srt/server.py,sha256=
-sglang/srt/server_args.py,sha256=
-sglang/srt/utils.py,sha256=
+sglang/srt/server.py,sha256=JUYAE8MDGYou_HbmuR10QFZfg319fGt9VamskvBkpFo,28776
+sglang/srt/server_args.py,sha256=V8sx2oY0yphHC_uATwv4UTiLUFnvMQl85o6y5AyaoXM,30086
+sglang/srt/utils.py,sha256=jGSlxbvI50xEybdupDQNHpsCaF1U_5buADrD149766g,27013
 sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
 sglang/srt/configs/model_config.py,sha256=mBXeDfFUijQnxd38gVGJ6QxgsiitDklfHvbjYBJFKQY,9470
 sglang/srt/configs/qwen2vl.py,sha256=AYHuFgJ0bwhWYkD7S6fvP7yJejJnuhy4xp5Q2W-O6ps,4424
 sglang/srt/constrained/__init__.py,sha256=LHj0-NxDQ7S_N3Pc1gJ-FmIJVN_PTP9ytitWOICSMHk,691
-sglang/srt/constrained/base_grammar_backend.py,sha256=
-sglang/srt/constrained/outlines_backend.py,sha256=
+sglang/srt/constrained/base_grammar_backend.py,sha256=OPuBSd_F_fRwjVj6YFWBQuGeikj7UQtkTvc-JgEYt4I,2259
+sglang/srt/constrained/outlines_backend.py,sha256=J03QQiT9pkdXyoYGw3Rj6taEyWlIr4VCBvxQ3aMiB8A,5786
 sglang/srt/constrained/outlines_jump_forward.py,sha256=1fnYxlrc24xjcW3Wx59Hyg0L9hiHIVgMVUsld3UDfW4,6102
-sglang/srt/constrained/xgrammar_backend.py,sha256=
+sglang/srt/constrained/xgrammar_backend.py,sha256=wMWqkLN5KhnJXL6GBqbcrhxvAAMx60nG88KIBU1bFSc,4505
 sglang/srt/layers/activation.py,sha256=7VEkCrx2dvl629Lz0fkJcJfVoZA-ykEdkpTzKEc_drQ,5225
 sglang/srt/layers/layernorm.py,sha256=HCj8Y_X6MNNdtQU2sWKgyjIqVERxl9dqrmjbBbyJjpE,3796
 sglang/srt/layers/linear.py,sha256=EOdlpAf6srqxzvPpxcv10KFJKedNc22CGP1qEvpRbDg,46131
@@ -59,19 +60,19 @@ sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=LnuWqGAba03e2
 sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
 sglang/srt/layers/fused_moe/fused_moe.py,sha256=N15tWTm2SGuesJxDIJAdV5FsDUpE-15sb_AIgr4swlw,23656
 sglang/srt/layers/fused_moe/layer.py,sha256=tbHnUJs3uvdDsl3VnwtyGA31VtFouNTPD7h7fPSCYOc,23613
-sglang/srt/layers/fused_moe/patch.py,sha256=
+sglang/srt/layers/fused_moe/patch.py,sha256=K5CNLnFVxRPd8_jlY4hW6bj7pAACeCFZQA8y5loqqM4,4029
 sglang/srt/layers/quantization/__init__.py,sha256=QilMNqgu3eOFUkEjXLSDa1NvoNdi_CAvC8a1hprOgN8,2979
 sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
 sglang/srt/lora/lora.py,sha256=meRL7oBUx8mxV_isc3Lp0EIsFQWC2PvaN-fE78BmMwg,14970
 sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
 sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
 sglang/srt/managers/data_parallel_controller.py,sha256=_XB6Ianc8TiqwLTW-7DH6gGjVYBeBU_6WjjaDk0snIY,5686
-sglang/srt/managers/detokenizer_manager.py,sha256=
+sglang/srt/managers/detokenizer_manager.py,sha256=erRgf8RijFrGnYjZawu9an1u2mFPRY3tnxzF9PbKc80,7295
 sglang/srt/managers/image_processor.py,sha256=Pk_dtXzljTkFt7Acsv1RyDzEqvCvjc7BMngxGhtkpDU,13817
 sglang/srt/managers/io_struct.py,sha256=O_oHnikwmOexNqH4HP6bwAI5d_jG_C96JGapkLg8B7c,12289
 sglang/srt/managers/schedule_batch.py,sha256=4BgocYdKFTDCrrBkSXCT75EALBx-3RYnoN3SgtdsHlU,39595
 sglang/srt/managers/schedule_policy.py,sha256=LH0rh1PiI5LK-dSd3dar8_po6FidiBUuj0Xcp_yNQAA,12295
-sglang/srt/managers/scheduler.py,sha256=
+sglang/srt/managers/scheduler.py,sha256=ty1sJ9U6JxifIGF4uzZX6CANMJtbjNWPe2k8aRPS6aI,48133
 sglang/srt/managers/tokenizer_manager.py,sha256=n_XCsCOwLZWCLv1ZJLGjyKgrAWCAQDyEhjnkxOptSa8,24436
 sglang/srt/managers/tp_worker.py,sha256=S5oim5xrkg1j68hYq6LfC8T533JYmQX9Kabt6U8ZXn4,5726
 sglang/srt/managers/tp_worker_overlap_thread.py,sha256=j5J4yHyR7w2HgAbN7S__299ADvsoyap5HK63SWMNavQ,7546
@@ -84,7 +85,7 @@ sglang/srt/metrics/collector.py,sha256=9kidVhr4ldbSntAYfzwJt_2CTUFnnej0OoQdxUUwU
 sglang/srt/metrics/func_timer.py,sha256=xe9UT4bPP1mA4GRZLsCd708cmv1B00hMpUmF7hzAKB4,3344
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=ZMkyfZpWgDXfBpJ4cenh1TxXtt1O2xqeiXhDkq6E5pU,12936
 sglang/srt/model_executor/forward_batch_info.py,sha256=61TVExbiXDQRvZ6oevNz9AIxG7e-KVddgj4I6MTivLg,9426
-sglang/srt/model_executor/model_runner.py,sha256=
+sglang/srt/model_executor/model_runner.py,sha256=QdFjQRnxZU8r7-MP-NdsnFnPWMRfxa-zTUmKOYmM8HE,26879
 sglang/srt/models/baichuan.py,sha256=RyvPQvi7wy9VUGvLwG17XttcTp43yRj6c3zNRImBToA,15005
 sglang/srt/models/chatglm.py,sha256=9hCXTqGX8DMvSPSn6wlK0YNNRWGS4UiS4-xjFsO9hYU,13135
 sglang/srt/models/commandr.py,sha256=leoQNn4VRqa9SXos6DcrkHVG6-Xp-kjBn2PUgqc9bs8,14051
@@ -123,10 +124,10 @@ sglang/srt/models/torch_native_llama.py,sha256=d8gVNurlVVZ-tD3Uc_aHyGCVUUp1gR8aw
 sglang/srt/models/xverse.py,sha256=meyCCdrZRYNK70hnmydgwhHa1FTBhKekEdpG0_IGTWY,13564
 sglang/srt/models/xverse_moe.py,sha256=xlrhJBAlRzxhp5o0WQU_2V5Uvf8I9fwZLOZBh95o3to,15673
 sglang/srt/models/yivl.py,sha256=xcWqkuZ29FmBBJY6aKetwItWIPl-kfXK-QmgdLONles,4765
-sglang/srt/openai_api/adapter.py,sha256=
-sglang/srt/openai_api/protocol.py,sha256=
+sglang/srt/openai_api/adapter.py,sha256=xYBmBLZ_JxfMt_m8LtVe_OB70GV4S9zBOL8e5g_VRvs,53432
+sglang/srt/openai_api/protocol.py,sha256=Mou5JUMKJkxVxoj4n8R4_sgnYY3OcwniiAi2TEM3hfY,10070
 sglang/srt/sampling/sampling_batch_info.py,sha256=7uoHypbbp4o71DfPmF22R_LeyM_Q9BTxBFg8O4lkd9w,7648
-sglang/srt/sampling/sampling_params.py,sha256=
+sglang/srt/sampling/sampling_params.py,sha256=zzWVm8DxcUDdPwV1MIh5q76mmLwtkun0E08T6U3ZyWA,5192
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
 sglang/srt/sampling/penaltylib/orchestrator.py,sha256=kizcPnxtRawmDt6utRuhbk4yfNs5H5mx1DAlDVEZRv8,11328
 sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
@@ -146,10 +147,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
 sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
-sglang/test/test_utils.py,sha256=
+sglang/test/test_utils.py,sha256=XvIAMeLXr4D7uLxCUSLTKP5Upc1EJd0JX2egL897Jfo,23100
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=q98pQDikkmvvvvAG-AXMYaYte1iHHW2TFhKGtAeGvdE,12802
-sglang-0.3.5.
-sglang-0.3.5.
-sglang-0.3.5.
-sglang-0.3.5.
-sglang-0.3.5.
+sglang-0.3.5.post2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.5.post2.dist-info/METADATA,sha256=ajoktPOWOAmE37TcZw562A22FmxntBUWO4zLOShVKpQ,21568
+sglang-0.3.5.post2.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+sglang-0.3.5.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.5.post2.dist-info/RECORD,,
{sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/LICENSE: File without changes
{sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/WHEEL: File without changes
{sglang-0.3.5.post1.dist-info → sglang-0.3.5.post2.dist-info}/top_level.txt: File without changes