sglang 0.3.0__py3-none-any.whl → 0.3.1.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to its public registry, and is provided for informational purposes only.
- sglang/bench_latency.py +17 -8
- sglang/bench_serving.py +33 -38
- sglang/global_config.py +5 -17
- sglang/lang/backend/runtime_endpoint.py +5 -2
- sglang/lang/interpreter.py +1 -4
- sglang/launch_server.py +3 -6
- sglang/launch_server_llavavid.py +7 -8
- sglang/srt/{model_config.py → configs/model_config.py} +5 -0
- sglang/srt/constrained/__init__.py +2 -0
- sglang/srt/constrained/fsm_cache.py +33 -38
- sglang/srt/constrained/jump_forward.py +0 -1
- sglang/srt/conversation.py +4 -1
- sglang/srt/hf_transformers_utils.py +1 -3
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention_backend.py +480 -0
- sglang/srt/layers/flashinfer_utils.py +235 -0
- sglang/srt/layers/fused_moe/layer.py +27 -7
- sglang/srt/layers/layernorm.py +12 -0
- sglang/srt/layers/logits_processor.py +64 -77
- sglang/srt/layers/radix_attention.py +11 -161
- sglang/srt/layers/sampler.py +38 -122
- sglang/srt/layers/torchao_utils.py +75 -0
- sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
- sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
- sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
- sglang/srt/lora/lora.py +403 -0
- sglang/srt/lora/lora_config.py +43 -0
- sglang/srt/lora/lora_manager.py +259 -0
- sglang/srt/managers/controller_multi.py +1 -5
- sglang/srt/managers/controller_single.py +0 -5
- sglang/srt/managers/io_struct.py +16 -1
- sglang/srt/managers/policy_scheduler.py +122 -5
- sglang/srt/managers/schedule_batch.py +105 -71
- sglang/srt/managers/tokenizer_manager.py +17 -8
- sglang/srt/managers/tp_worker.py +188 -121
- sglang/srt/model_executor/cuda_graph_runner.py +69 -133
- sglang/srt/model_executor/forward_batch_info.py +35 -312
- sglang/srt/model_executor/model_runner.py +123 -154
- sglang/srt/models/baichuan.py +416 -0
- sglang/srt/models/chatglm.py +1 -5
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/exaone.py +1 -5
- sglang/srt/models/gemma.py +1 -5
- sglang/srt/models/gemma2.py +1 -5
- sglang/srt/models/gpt_bigcode.py +1 -5
- sglang/srt/models/grok.py +1 -5
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama.py +51 -5
- sglang/srt/models/llama_classification.py +1 -20
- sglang/srt/models/llava.py +30 -5
- sglang/srt/models/llavavid.py +2 -2
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/minicpm3.py +669 -0
- sglang/srt/models/mixtral.py +6 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/olmoe.py +415 -0
- sglang/srt/models/qwen.py +1 -5
- sglang/srt/models/qwen2.py +1 -5
- sglang/srt/models/qwen2_moe.py +6 -5
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/models/xverse.py +375 -0
- sglang/srt/models/xverse_moe.py +445 -0
- sglang/srt/openai_api/adapter.py +65 -46
- sglang/srt/openai_api/protocol.py +11 -3
- sglang/srt/sampling/sampling_batch_info.py +46 -80
- sglang/srt/server.py +30 -15
- sglang/srt/server_args.py +163 -28
- sglang/srt/utils.py +19 -51
- sglang/test/few_shot_gsm8k.py +132 -0
- sglang/test/runners.py +114 -22
- sglang/test/test_programs.py +7 -5
- sglang/test/test_utils.py +85 -2
- sglang/utils.py +32 -37
- sglang/version.py +1 -1
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA +30 -18
- sglang-0.3.1.post1.dist-info/RECORD +130 -0
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/WHEEL +1 -1
- sglang-0.3.0.dist-info/RECORD +0 -118
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/top_level.txt +0 -0
sglang/test/runners.py
CHANGED
@@ -21,6 +21,7 @@ from typing import List, Union
 
 import torch
 import torch.nn.functional as F
+from peft import PeftModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from sglang.srt.server import Runtime
@@ -50,6 +51,13 @@ def get_dtype_str(torch_dtype):
     raise NotImplementedError()
 
 
+def get_top_logprobs(logits, k):
+    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    del logits
+    logprobs, top_indices = torch.topk(logprobs, k=k, dim=-1)
+    return logprobs
+
+
 @dataclass
 class ModelOutput:
     output_strs: List[str] = None
@@ -65,8 +73,10 @@ class HFRunner:
         model_path,
         torch_dtype,
         is_generation,
+        output_str_only=False,
     ):
         self.is_generation = is_generation
+        self.output_str_only = output_str_only
 
         self.in_queue = mp.Queue()
         self.out_queue = mp.Queue()
@@ -89,7 +99,7 @@ class HFRunner:
         )
 
         if self.is_generation:
-            self.
+            self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
                 trust_remote_code=False,
@@ -104,12 +114,16 @@ class HFRunner:
         )
 
         while True:
-            prompts, max_new_tokens = in_queue.get()
+            prompts, max_new_tokens, lora_paths = in_queue.get()
+            if lora_paths is not None:
+                assert len(prompts) == len(lora_paths)
+
             if prompts is not None:
                 if self.is_generation:
                     output_strs = []
-
-
+                    top_input_logprobs = []
+                    top_output_logprobs = []
+                    for i, p in enumerate(prompts):
                         if isinstance(p, str):
                             input_ids = self.tokenizer.encode(
                                 p, return_tensors="pt"
@@ -117,40 +131,68 @@ class HFRunner:
                         else:
                             input_ids = torch.tensor([p], device="cuda")
 
-
-
+                        if lora_paths is not None and lora_paths[i] is not None:
+                            self.model = PeftModel.from_pretrained(
+                                self.base_model,
+                                lora_paths[i],
+                                torch_dtype=torch_dtype,
+                                is_trainable=False,
+                            )
+                        else:
+                            self.model = self.base_model
+
+                        outputs = self.model.generate(
+                            input_ids,
+                            do_sample=False,
+                            temperature=None,
+                            top_p=None,
+                            max_new_tokens=max_new_tokens,
+                            return_dict_in_generate=True,
+                            output_scores=(not self.output_str_only),
                         )
                         output_strs.append(
-                            self.tokenizer.decode(
+                            self.tokenizer.decode(outputs[0][0][len(input_ids[0]) :])
                         )
-                        ... (10 removed lines truncated in the original diff view)
+                        if not self.output_str_only:
+                            # outputs.scores: (num_token, 1, vocab_size)
+                            top_output_logprobs.append(
+                                [
+                                    get_top_logprobs(
+                                        logits[0], NUM_TOP_LOGPROBS
+                                    ).tolist()
+                                    for logits in outputs.scores
+                                ]
+                            )
+                            del outputs
+
+                            input_logits = self.model.forward(input_ids).logits[0]
+                            top_input_logprobs.append(
+                                get_top_logprobs(
+                                    input_logits, NUM_TOP_LOGPROBS
+                                ).tolist()
+                            )
+                            del input_logits
 
                     out_queue.put(
                         ModelOutput(
-                            output_strs=output_strs,
+                            output_strs=output_strs,
+                            top_input_logprobs=top_input_logprobs,
+                            top_output_logprobs=top_output_logprobs,
                         )
                     )
 
                 else:
+                    assert not self.output_str_only
                     logits = self.model.encode(prompts).tolist()
-
                     out_queue.put(ModelOutput(embed_logits=logits))
 
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
         max_new_tokens=8,
+        lora_paths=None,
     ):
-        self.in_queue.put((prompts, max_new_tokens))
+        self.in_queue.put((prompts, max_new_tokens, lora_paths))
         return self.out_queue.get()
 
     def terminate(self):
@@ -173,6 +215,10 @@ class SRTRunner:
         is_generation,
         tp_size=1,
         port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
+        lora_paths=None,
+        max_loras_per_batch=4,
+        disable_cuda_graph=False,
+        disable_radix_cache=False,
     ):
         self.is_generation = is_generation
         self.runtime = Runtime(
@@ -183,21 +229,28 @@ class SRTRunner:
             mem_fraction_static=0.69,
             trust_remote_code=False,
             is_embedding=not self.is_generation,
+            lora_paths=lora_paths,
+            max_loras_per_batch=max_loras_per_batch,
+            disable_cuda_graph=disable_cuda_graph,
+            disable_radix_cache=disable_radix_cache,
         )
 
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
         max_new_tokens=8,
+        lora_paths=None,
    ):
         if self.is_generation:
             # the return value contains logprobs from prefill
             output_strs = []
             top_input_logprobs = []
+            top_output_logprobs = []
             sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
-            for prompt in prompts:
+            for i, prompt in enumerate(prompts):
                 response = self.runtime.generate(
                     prompt,
+                    lora_path=lora_paths[i] if lora_paths else None,
                     sampling_params=sampling_params,
                     return_logprob=True,
                     logprob_start_len=0,
@@ -219,9 +272,48 @@ class SRTRunner:
                         ]
                     ]
                 )
+                top_output_logprobs.append(
+                    [
+                        [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
+                        for x in response["meta_info"]["output_top_logprobs"]
+                    ]
+                )
+
+            return ModelOutput(
+                output_strs=output_strs,
+                top_input_logprobs=top_input_logprobs,
+                top_output_logprobs=top_output_logprobs,
+            )
+        else:
+            response = self.runtime.encode(prompts)
+            response = json.loads(response)
+            logits = [x["embedding"] for x in response]
+            return ModelOutput(embed_logits=logits)
+
+    def batch_forward(
+        self,
+        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        max_new_tokens=8,
+        lora_paths=None,
+    ):
+        """
+        testing serving by sending all prompts once
+        only return output strings and no logprobs
+        """
+        if self.is_generation:
+            # the return value contains logprobs from prefill
+            output_strs = []
+            sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+            response = self.runtime.generate(
+                prompts,
+                lora_path=lora_paths if lora_paths else None,
+                sampling_params=sampling_params,
+            )
+            response = json.loads(response)
+            output_strs = [r["text"] for r in response]
 
             return ModelOutput(
-                output_strs=output_strs,
+                output_strs=output_strs,
             )
         else:
             response = self.runtime.encode(prompts)
sglang/test/test_programs.py
CHANGED
@@ -7,7 +7,7 @@ import time
 import numpy as np
 
 import sglang as sgl
-from sglang.utils import
+from sglang.utils import download_and_cache_file, read_jsonl
 
 
 def test_few_shot_qa():
@@ -456,10 +456,6 @@ def test_chat_completion_speculative():
 def test_hellaswag_select():
     """Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
 
-    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
-    lines = fetch_and_cache_jsonl(url)
-
-    # Construct prompts
     def get_one_example(lines, i, include_answer):
         ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
         if include_answer:
@@ -472,6 +468,12 @@ def test_hellaswag_select():
         ret += get_one_example(lines, i, True) + "\n\n"
         return ret
 
+    # Read data
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    filename = download_and_cache_file(url)
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
     num_questions = 200
     num_shots = 20
     few_shot_examples = get_few_shot_examples(lines, num_shots)
sglang/test/test_utils.py
CHANGED
@@ -7,6 +7,7 @@ import subprocess
 import threading
 import time
 from functools import partial
+from types import SimpleNamespace
 from typing import Callable, List, Optional
 
 import numpy as np
@@ -14,6 +15,7 @@ import requests
 import torch
 import torch.nn.functional as F
 
+from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
@@ -28,7 +30,13 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruc
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
 
-
+
+def is_in_ci():
+    """Return whether it is in CI runner."""
+    return os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+
+
+if is_in_ci():
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
     DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
 else:
@@ -296,7 +304,6 @@ def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
 def select_sglang_backend(args: argparse.Namespace):
     if args.backend.startswith("srt"):
         if args.backend == "srt-no-parallel":
-            global_config.enable_parallel_decoding = False
             global_config.enable_parallel_encoding = False
         backend = RuntimeEndpoint(f"{args.host}:{args.port}")
     elif args.backend.startswith("gpt-"):
@@ -501,3 +508,79 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
 
 def get_similarities(vec1, vec2):
     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
+
+
+def run_bench_serving(model, num_prompts, request_rate, other_server_args):
+    # Launch the server
+    base_url = DEFAULT_URL_FOR_TEST
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # Run benchmark
+    args = SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name="random",
+        dataset_path="",
+        model=None,
+        tokenizer=None,
+        num_prompts=num_prompts,
+        sharegpt_output_len=None,
+        random_input_len=4096,
+        random_output_len=2048,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        seed=0,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=False,
+        disable_ignore_eos=False,
+        extra_request_body=None,
+    )
+
+    try:
+        res = run_benchmark(args)
+    finally:
+        kill_child_process(process.pid)
+
+    assert res["completed"] == num_prompts
+    return res
+
+
+def run_bench_latency(model, other_args):
+    command = [
+        "python3",
+        "-m",
+        "sglang.bench_latency",
+        "--model-path",
+        model,
+        "--batch-size",
+        "1",
+        "--input",
+        "128",
+        "--output",
+        "8",
+        *other_args,
+    ]
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    try:
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}", flush=True)
+        print(f"Error: {error}", flush=True)
+
+        lastline = output.split("\n")[-3]
+        output_throughput = float(lastline.split(" ")[-2])
+    finally:
+        kill_child_process(process.pid)
+
+    return output_throughput
sglang/utils.py
CHANGED
@@ -12,7 +12,7 @@ import urllib.request
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from json import dumps
-from typing import Union
+from typing import Optional, Union
 
 import numpy as np
 import requests
@@ -38,13 +38,11 @@ def is_same_type(values: list):
 
 def read_jsonl(filename: str):
     """Read a JSONL file."""
-    rets = []
     with open(filename) as fin:
         for line in fin:
             if line.startswith("#"):
                 continue
-
-    return rets
+            yield json.loads(line)
 
 
 def dump_state_text(filename: str, states: list, mode: str = "w"):
@@ -264,38 +262,35 @@ class LazyImport:
         return module(*args, **kwargs)
 
 
-def
-    """Read and cache a
+def download_and_cache_file(url: str, filename: Optional[str] = None):
+    """Read and cache a file from a url."""
+    if filename is None:
+        filename = os.path.join("/tmp", url.split("/")[-1])
 
     # Check if the cache file already exists
-    if os.path.exists(
-    ... (25 removed lines truncated in the original diff view)
-    # Convert the data to a list of dictionaries
-    with open(cache_file, "r") as f:
-        data = [json.loads(line) for line in f]
-
-    return data
+    if os.path.exists(filename):
+        return filename
+
+    print(f"Downloading from {url} to {filename}")
+
+    # Stream the response to show the progress bar
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Check for request errors
+
+    # Total size of the file in bytes
+    total_size = int(response.headers.get("content-length", 0))
+    chunk_size = 1024  # Download in chunks of 1KB
+
+    # Use tqdm to display the progress bar
+    with open(filename, "wb") as f, tqdm(
+        desc=filename,
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+    ) as bar:
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            f.write(chunk)
+            bar.update(len(chunk))
+
+    return filename
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.0"
+__version__ = "0.3.1.post1"
{sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.0
+Version: 0.3.1.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -242,6 +242,7 @@ Requires-Dist: psutil; extra == "srt"
 Requires-Dist: pydantic; extra == "srt"
 Requires-Dist: python-multipart; extra == "srt"
 Requires-Dist: torch; extra == "srt"
+Requires-Dist: torchao; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
@@ -253,6 +254,7 @@ Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: sentence-transformers; extra == "test"
 Requires-Dist: accelerate; extra == "test"
+Requires-Dist: peft; extra == "test"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -271,14 +273,16 @@ Requires-Dist: accelerate; extra == "test"
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
-
 The core features include:
-
-- **
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
 
 ## News
+- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
@@ -300,6 +304,8 @@ The core features include:
 
 ## Install
 
+You can install SGLang using any of the methods below.
+
 ### Method 1: With pip
 ```
 pip install --upgrade pip
@@ -312,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -323,7 +329,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
-The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
@@ -391,7 +397,7 @@ sky status --endpoint 30000 sglang
 
 
 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please switch to other kernels by adding `--attention-backend triton --sampling-backend pytorch` and open an issue on GitHub.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Backend: SGLang Runtime (SRT)
@@ -457,24 +463,29 @@ print(response)
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
--
+- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 2
 ```
--
+- To enable multi-GPU data parallelism, add `--dp 2`. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --dp 2 --tp 2
 ```
 - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
--
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
+- To enable DeepSeek MLA acceleration, add `--enable-mla`.
+- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -482,9 +493,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 # Node 1
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
-- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
@@ -510,6 +518,10 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 - Exaone 3
+- BaiChuan2
+- MiniCPM / MiniCPM 3
+- XVERSE / XVERSE MoE
+
 
 **Embedding Models**
 