sglang 0.2.14.post2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +2 -0
- sglang/bench_latency.py +39 -28
- sglang/lang/backend/runtime_endpoint.py +8 -4
- sglang/lang/interpreter.py +3 -0
- sglang/lang/ir.py +5 -0
- sglang/launch_server_llavavid.py +12 -12
- sglang/srt/configs/__init__.py +5 -0
- sglang/srt/configs/exaone.py +195 -0
- sglang/srt/constrained/fsm_cache.py +1 -1
- sglang/srt/conversation.py +24 -2
- sglang/srt/hf_transformers_utils.py +12 -12
- sglang/srt/layers/extend_attention.py +13 -8
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/sampler.py +94 -17
- sglang/srt/managers/controller_multi.py +5 -5
- sglang/srt/managers/controller_single.py +5 -5
- sglang/srt/managers/io_struct.py +6 -1
- sglang/srt/managers/schedule_batch.py +26 -11
- sglang/srt/managers/tokenizer_manager.py +9 -9
- sglang/srt/managers/tp_worker.py +38 -26
- sglang/srt/model_config.py +3 -3
- sglang/srt/model_executor/cuda_graph_runner.py +26 -9
- sglang/srt/model_executor/forward_batch_info.py +68 -23
- sglang/srt/model_executor/model_runner.py +15 -22
- sglang/srt/models/chatglm.py +9 -15
- sglang/srt/models/commandr.py +5 -1
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +5 -1
- sglang/srt/models/deepseek_v2.py +57 -25
- sglang/srt/models/exaone.py +368 -0
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/gemma2.py +5 -1
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +5 -1
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/{llama2.py → llama.py} +25 -45
- sglang/srt/models/llama_classification.py +34 -41
- sglang/srt/models/llama_embedding.py +7 -6
- sglang/srt/models/llava.py +8 -11
- sglang/srt/models/llavavid.py +5 -6
- sglang/srt/models/minicpm.py +5 -1
- sglang/srt/models/mistral.py +2 -3
- sglang/srt/models/mixtral.py +6 -2
- sglang/srt/models/mixtral_quant.py +5 -1
- sglang/srt/models/qwen.py +5 -2
- sglang/srt/models/qwen2.py +6 -2
- sglang/srt/models/qwen2_moe.py +5 -14
- sglang/srt/models/stablelm.py +5 -1
- sglang/srt/openai_api/adapter.py +16 -1
- sglang/srt/openai_api/protocol.py +5 -5
- sglang/srt/sampling/sampling_batch_info.py +75 -6
- sglang/srt/server.py +6 -6
- sglang/srt/utils.py +0 -3
- sglang/test/runners.py +1 -1
- sglang/test/test_programs.py +68 -0
- sglang/test/test_utils.py +4 -0
- sglang/utils.py +39 -0
- sglang/version.py +1 -1
- {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/METADATA +9 -8
- sglang-0.3.0.dist-info/RECORD +118 -0
- {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/WHEEL +1 -1
- sglang-0.2.14.post2.dist-info/RECORD +0 -115
- {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/top_level.txt +0 -0
@@ -21,10 +21,59 @@ class SamplingBatchInfo:
|
|
21
21
|
top_ps: torch.Tensor = None
|
22
22
|
top_ks: torch.Tensor = None
|
23
23
|
min_ps: torch.Tensor = None
|
24
|
-
|
24
|
+
|
25
|
+
# Dispatch in CUDA graph
|
26
|
+
need_min_p_sampling: bool = False
|
27
|
+
|
28
|
+
# Bias Tensors
|
25
29
|
logit_bias: torch.Tensor = None
|
26
30
|
vocab_mask: torch.Tensor = None
|
27
31
|
|
32
|
+
# Penalizer
|
33
|
+
penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
|
34
|
+
linear_penalties: torch.Tensor = None
|
35
|
+
scaling_penalties: torch.Tensor = None
|
36
|
+
|
37
|
+
def can_run_in_cuda_graph(self):
|
38
|
+
# Vocab bias and min_ps are not supported in CUDA graph
|
39
|
+
return (
|
40
|
+
self.logit_bias is None
|
41
|
+
and self.vocab_mask is None
|
42
|
+
and self.linear_penalties is None
|
43
|
+
and self.scaling_penalties is None
|
44
|
+
and not self.need_min_p_sampling
|
45
|
+
)
|
46
|
+
|
47
|
+
@classmethod
|
48
|
+
def dummy_one(cls, max_bs: int, vocab_size: int):
|
49
|
+
ret = cls(vocab_size=vocab_size)
|
50
|
+
ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
|
51
|
+
ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
|
52
|
+
ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
|
53
|
+
return ret
|
54
|
+
|
55
|
+
def __getitem__(self, key):
|
56
|
+
if isinstance(key, slice):
|
57
|
+
# NOTE:This method is only used in CUDA graph
|
58
|
+
assert self.can_run_in_cuda_graph()
|
59
|
+
return SamplingBatchInfo(
|
60
|
+
vocab_size=self.vocab_size,
|
61
|
+
temperatures=self.temperatures[key],
|
62
|
+
top_ps=self.top_ps[key],
|
63
|
+
top_ks=self.top_ks[key],
|
64
|
+
)
|
65
|
+
else:
|
66
|
+
raise NotImplementedError
|
67
|
+
|
68
|
+
def inplace_assign(self, bs: int, other: SamplingBatchInfo):
|
69
|
+
# NOTE:This method is only used in CUDA graph
|
70
|
+
assert self.can_run_in_cuda_graph()
|
71
|
+
|
72
|
+
self.vocab_size = other.vocab_size
|
73
|
+
self.temperatures[:bs] = other.temperatures
|
74
|
+
self.top_ps[:bs] = other.top_ps
|
75
|
+
self.top_ks[:bs] = other.top_ks
|
76
|
+
|
28
77
|
@classmethod
|
29
78
|
def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
|
30
79
|
device = "cuda"
|
@@ -45,6 +94,7 @@ class SamplingBatchInfo:
|
|
45
94
|
ret.min_ps = torch.tensor(
|
46
95
|
[r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
|
47
96
|
)
|
97
|
+
ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)
|
48
98
|
|
49
99
|
# Each penalizers will do nothing if they evaluate themselves as not required by looking at
|
50
100
|
# the sampling_params of the requests (See {_is_required()} of each penalizers). So this
|
@@ -72,6 +122,25 @@ class SamplingBatchInfo:
|
|
72
122
|
|
73
123
|
return ret
|
74
124
|
|
125
|
+
def prepare_penalties(self):
|
126
|
+
self.scaling_penalties = None
|
127
|
+
self.linear_penalties = None
|
128
|
+
|
129
|
+
for penalizer in self.penalizer_orchestrator.penalizers.values():
|
130
|
+
if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
|
131
|
+
if penalizer.is_prepared():
|
132
|
+
self.scaling_penalties = penalizer.cumulated_repetition_penalties
|
133
|
+
else:
|
134
|
+
if penalizer.is_prepared():
|
135
|
+
if self.linear_penalties is None:
|
136
|
+
bs = self.penalizer_orchestrator.batch.batch_size()
|
137
|
+
self.linear_penalties = torch.zeros(
|
138
|
+
(bs, self.vocab_size),
|
139
|
+
dtype=torch.float32,
|
140
|
+
device="cuda",
|
141
|
+
)
|
142
|
+
self.linear_penalties = penalizer.apply(self.linear_penalties)
|
143
|
+
|
75
144
|
def update_regex_vocab_mask(self, batch: ScheduleBatch):
|
76
145
|
bs, reqs = batch.batch_size(), batch.reqs
|
77
146
|
device = "cuda"
|
@@ -81,15 +150,15 @@ class SamplingBatchInfo:
|
|
81
150
|
self.vocab_mask = None
|
82
151
|
|
83
152
|
if has_regex:
|
153
|
+
self.vocab_mask = torch.zeros(
|
154
|
+
bs, self.vocab_size, dtype=torch.bool, device=device
|
155
|
+
)
|
84
156
|
for i, req in enumerate(reqs):
|
85
157
|
if req.regex_fsm is not None:
|
86
|
-
|
87
|
-
self.vocab_mask = torch.zeros(
|
88
|
-
bs, self.vocab_size, dtype=torch.bool, device=device
|
89
|
-
)
|
158
|
+
self.vocab_mask[i].fill_(1)
|
90
159
|
self.vocab_mask[i][
|
91
160
|
req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
|
92
|
-
] =
|
161
|
+
] = 0
|
93
162
|
|
94
163
|
def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
|
95
164
|
self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
|
sglang/srt/server.py
CHANGED
@@ -272,7 +272,7 @@ async def retrieve_file_content(file_id: str):
|
|
272
272
|
|
273
273
|
def launch_server(
|
274
274
|
server_args: ServerArgs,
|
275
|
-
|
275
|
+
model_override_args: Optional[dict] = None,
|
276
276
|
pipe_finish_writer: Optional[mp.connection.Connection] = None,
|
277
277
|
):
|
278
278
|
"""Launch an HTTP server."""
|
@@ -317,7 +317,7 @@ def launch_server(
|
|
317
317
|
tp_rank_range,
|
318
318
|
server_args,
|
319
319
|
ports[3],
|
320
|
-
|
320
|
+
model_override_args,
|
321
321
|
)
|
322
322
|
|
323
323
|
try:
|
@@ -328,7 +328,7 @@ def launch_server(
|
|
328
328
|
return
|
329
329
|
|
330
330
|
# Launch processes
|
331
|
-
tokenizer_manager = TokenizerManager(server_args, port_args,
|
331
|
+
tokenizer_manager = TokenizerManager(server_args, port_args, model_override_args)
|
332
332
|
if server_args.chat_template:
|
333
333
|
load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
|
334
334
|
pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
|
@@ -341,7 +341,7 @@ def launch_server(
|
|
341
341
|
|
342
342
|
proc_controller = mp.Process(
|
343
343
|
target=start_controller_process,
|
344
|
-
args=(server_args, port_args, pipe_controller_writer,
|
344
|
+
args=(server_args, port_args, pipe_controller_writer, model_override_args),
|
345
345
|
)
|
346
346
|
proc_controller.start()
|
347
347
|
|
@@ -501,7 +501,7 @@ class Runtime:
|
|
501
501
|
def __init__(
|
502
502
|
self,
|
503
503
|
log_level: str = "error",
|
504
|
-
|
504
|
+
model_override_args: Optional[dict] = None,
|
505
505
|
*args,
|
506
506
|
**kwargs,
|
507
507
|
):
|
@@ -525,7 +525,7 @@ class Runtime:
|
|
525
525
|
|
526
526
|
proc = mp.Process(
|
527
527
|
target=launch_server,
|
528
|
-
args=(self.server_args,
|
528
|
+
args=(self.server_args, model_override_args, pipe_writer),
|
529
529
|
)
|
530
530
|
proc.start()
|
531
531
|
pipe_writer.close()
|
sglang/srt/utils.py
CHANGED
@@ -407,7 +407,6 @@ def monkey_patch_vllm_dummy_weight_loader():
|
|
407
407
|
DummyModelLoader,
|
408
408
|
LoRAConfig,
|
409
409
|
ModelConfig,
|
410
|
-
MultiModalConfig,
|
411
410
|
ParallelConfig,
|
412
411
|
SchedulerConfig,
|
413
412
|
_initialize_model,
|
@@ -422,7 +421,6 @@ def monkey_patch_vllm_dummy_weight_loader():
|
|
422
421
|
model_config: ModelConfig,
|
423
422
|
device_config: DeviceConfig,
|
424
423
|
lora_config: Optional[LoRAConfig],
|
425
|
-
multimodal_config: Optional[MultiModalConfig],
|
426
424
|
parallel_config: ParallelConfig,
|
427
425
|
scheduler_config: SchedulerConfig,
|
428
426
|
cache_config: CacheConfig,
|
@@ -433,7 +431,6 @@ def monkey_patch_vllm_dummy_weight_loader():
|
|
433
431
|
model_config,
|
434
432
|
self.load_config,
|
435
433
|
lora_config,
|
436
|
-
multimodal_config,
|
437
434
|
cache_config,
|
438
435
|
)
|
439
436
|
|
sglang/test/runners.py
CHANGED
sglang/test/test_programs.py
CHANGED
@@ -2,8 +2,12 @@
|
|
2
2
|
|
3
3
|
import json
|
4
4
|
import re
|
5
|
+
import time
|
6
|
+
|
7
|
+
import numpy as np
|
5
8
|
|
6
9
|
import sglang as sgl
|
10
|
+
from sglang.utils import fetch_and_cache_jsonl
|
7
11
|
|
8
12
|
|
9
13
|
def test_few_shot_qa():
|
@@ -447,3 +451,67 @@ def test_chat_completion_speculative():
|
|
447
451
|
)
|
448
452
|
|
449
453
|
gen_character_spec().sync()
|
454
|
+
|
455
|
+
|
456
|
+
def test_hellaswag_select():
|
457
|
+
"""Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
|
458
|
+
|
459
|
+
url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
|
460
|
+
lines = fetch_and_cache_jsonl(url)
|
461
|
+
|
462
|
+
# Construct prompts
|
463
|
+
def get_one_example(lines, i, include_answer):
|
464
|
+
ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
|
465
|
+
if include_answer:
|
466
|
+
ret += lines[i]["endings"][lines[i]["label"]]
|
467
|
+
return ret
|
468
|
+
|
469
|
+
def get_few_shot_examples(lines, k):
|
470
|
+
ret = ""
|
471
|
+
for i in range(k):
|
472
|
+
ret += get_one_example(lines, i, True) + "\n\n"
|
473
|
+
return ret
|
474
|
+
|
475
|
+
num_questions = 200
|
476
|
+
num_shots = 20
|
477
|
+
few_shot_examples = get_few_shot_examples(lines, num_shots)
|
478
|
+
|
479
|
+
questions = []
|
480
|
+
choices = []
|
481
|
+
labels = []
|
482
|
+
for i in range(len(lines[:num_questions])):
|
483
|
+
questions.append(get_one_example(lines, i, False))
|
484
|
+
choices.append(lines[i]["endings"])
|
485
|
+
labels.append(lines[i]["label"])
|
486
|
+
arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
|
487
|
+
|
488
|
+
#####################################
|
489
|
+
######### SGL Program Begin #########
|
490
|
+
#####################################
|
491
|
+
|
492
|
+
import sglang as sgl
|
493
|
+
|
494
|
+
@sgl.function
|
495
|
+
def few_shot_hellaswag(s, question, choices):
|
496
|
+
s += few_shot_examples + question
|
497
|
+
s += sgl.select("answer", choices=choices)
|
498
|
+
|
499
|
+
#####################################
|
500
|
+
########## SGL Program End ##########
|
501
|
+
#####################################
|
502
|
+
|
503
|
+
# Run requests
|
504
|
+
tic = time.time()
|
505
|
+
rets = few_shot_hellaswag.run_batch(
|
506
|
+
arguments,
|
507
|
+
temperature=0,
|
508
|
+
num_threads=64,
|
509
|
+
progress_bar=True,
|
510
|
+
)
|
511
|
+
preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
|
512
|
+
latency = time.time() - tic
|
513
|
+
|
514
|
+
# Compute accuracy
|
515
|
+
accuracy = np.mean(np.array(preds) == np.array(labels))
|
516
|
+
|
517
|
+
return accuracy, latency
|
sglang/test/test_utils.py
CHANGED
@@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback
|
|
23
23
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
24
24
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
25
25
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
|
26
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
|
27
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
|
28
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
|
29
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
|
26
30
|
|
27
31
|
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
28
32
|
DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
|
sglang/utils.py
CHANGED
@@ -4,6 +4,7 @@ import base64
|
|
4
4
|
import importlib
|
5
5
|
import json
|
6
6
|
import logging
|
7
|
+
import os
|
7
8
|
import signal
|
8
9
|
import sys
|
9
10
|
import traceback
|
@@ -15,6 +16,7 @@ from typing import Union
|
|
15
16
|
|
16
17
|
import numpy as np
|
17
18
|
import requests
|
19
|
+
from tqdm import tqdm
|
18
20
|
|
19
21
|
logger = logging.getLogger(__name__)
|
20
22
|
|
@@ -260,3 +262,40 @@ class LazyImport:
|
|
260
262
|
def __call__(self, *args, **kwargs):
|
261
263
|
module = self._load()
|
262
264
|
return module(*args, **kwargs)
|
265
|
+
|
266
|
+
|
267
|
+
def fetch_and_cache_jsonl(url, cache_file="cached_data.jsonl"):
|
268
|
+
"""Read and cache a jsonl file from a url."""
|
269
|
+
|
270
|
+
# Check if the cache file already exists
|
271
|
+
if os.path.exists(cache_file):
|
272
|
+
print("Loading data from cache...")
|
273
|
+
with open(cache_file, "r") as f:
|
274
|
+
data = [json.loads(line) for line in f]
|
275
|
+
else:
|
276
|
+
print("Downloading data from URL...")
|
277
|
+
# Stream the response to show the progress bar
|
278
|
+
response = requests.get(url, stream=True)
|
279
|
+
response.raise_for_status() # Check for request errors
|
280
|
+
|
281
|
+
# Total size of the file in bytes
|
282
|
+
total_size = int(response.headers.get("content-length", 0))
|
283
|
+
chunk_size = 1024 # Download in chunks of 1KB
|
284
|
+
|
285
|
+
# Use tqdm to display the progress bar
|
286
|
+
with open(cache_file, "wb") as f, tqdm(
|
287
|
+
desc=cache_file,
|
288
|
+
total=total_size,
|
289
|
+
unit="B",
|
290
|
+
unit_scale=True,
|
291
|
+
unit_divisor=1024,
|
292
|
+
) as bar:
|
293
|
+
for chunk in response.iter_content(chunk_size=chunk_size):
|
294
|
+
f.write(chunk)
|
295
|
+
bar.update(len(chunk))
|
296
|
+
|
297
|
+
# Convert the data to a list of dictionaries
|
298
|
+
with open(cache_file, "r") as f:
|
299
|
+
data = [json.loads(line) for line in f]
|
300
|
+
|
301
|
+
return data
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.
|
1
|
+
__version__ = "0.3.0"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.3.0
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
312
312
|
### Method 2: From source
|
313
313
|
```
|
314
314
|
# Use the last release branch
|
315
|
-
git clone -b v0.
|
315
|
+
git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
|
316
316
|
cd sglang
|
317
317
|
|
318
318
|
pip install --upgrade pip
|
@@ -461,7 +461,7 @@ It supports streaming, vision, and most features of the Chat/Completions/Models/
|
|
461
461
|
```
|
462
462
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
|
463
463
|
```
|
464
|
-
- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism.
|
464
|
+
- Add `--dp 2` to enable multi-GPU data parallelism. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
|
465
465
|
```
|
466
466
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
|
467
467
|
```
|
@@ -489,13 +489,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
489
489
|
### Supported Models
|
490
490
|
|
491
491
|
**Generative Models**
|
492
|
-
|
493
492
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
494
493
|
- Mistral / Mixtral / Mistral NeMo
|
495
494
|
- Gemma / Gemma 2
|
496
495
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
497
496
|
- DeepSeek / DeepSeek 2
|
498
497
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
498
|
+
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
|
499
499
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
|
500
500
|
- Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
|
501
501
|
- LLaVA 1.5 / 1.6 / NeXT
|
@@ -509,6 +509,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
509
509
|
- Grok
|
510
510
|
- ChatGLM
|
511
511
|
- InternLM 2
|
512
|
+
- Exaone 3
|
512
513
|
|
513
514
|
**Embedding Models**
|
514
515
|
|
@@ -636,7 +637,7 @@ print(state["answer_1"])
|
|
636
637
|
#### More Examples
|
637
638
|
|
638
639
|
Anthropic and VertexAI (Gemini) models are also supported.
|
639
|
-
You can find more examples at [examples/quick_start](examples/quick_start).
|
640
|
+
You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).
|
640
641
|
|
641
642
|
### Language Feature
|
642
643
|
To begin with, import sglang.
|
@@ -649,7 +650,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
|
|
649
650
|
You can then invoke the function with `run` or `run_batch`.
|
650
651
|
The system will manage the state, chat template, parallelism and batching for you.
|
651
652
|
|
652
|
-
The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
|
653
|
+
The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)
|
653
654
|
|
654
655
|
#### Control Flow
|
655
656
|
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
|
@@ -698,7 +699,7 @@ def image_qa(s, image_file, question):
|
|
698
699
|
s += sgl.assistant(sgl.gen("answer", max_tokens=256)
|
699
700
|
```
|
700
701
|
|
701
|
-
See also [srt_example_llava.py](examples/quick_start/
|
702
|
+
See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).
|
702
703
|
|
703
704
|
#### Constrained Decoding
|
704
705
|
Use `regex` to specify a regular expression as a decoding constraint.
|
@@ -742,7 +743,7 @@ def character_gen(s, name):
|
|
742
743
|
s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
|
743
744
|
```
|
744
745
|
|
745
|
-
See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
|
746
|
+
See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
|
746
747
|
|
747
748
|
#### Batching
|
748
749
|
Use `run_batch` to run a batch of requests with continuous batching.
|
@@ -0,0 +1,118 @@
|
|
1
|
+
sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
|
2
|
+
sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
|
3
|
+
sglang/bench_latency.py,sha256=F7jMfKqMf1XFKJgkpR_yE33VJpsIhSr_SOJeRbngkb0,16758
|
4
|
+
sglang/bench_serving.py,sha256=J_mMwnmDn0Jt07mzdGAuYOxpockHPLYJFL-kwoaqASY,36527
|
5
|
+
sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
|
6
|
+
sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
|
7
|
+
sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
|
8
|
+
sglang/launch_server_llavavid.py,sha256=xnpSILJxsrbvqkERav5P26bErCQnhoTFmoKeScJltUA,1034
|
9
|
+
sglang/utils.py,sha256=zxHwQhVxW_lWf-IH0wUw_pBTRLHLPypdRiU5M4XosMM,9669
|
10
|
+
sglang/version.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
|
11
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
+
sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
|
13
|
+
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
14
|
+
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
15
|
+
sglang/lang/interpreter.py,sha256=AC3tNNDwYfiu87jCldBWXYpFicCv6NMPJACMFEfCXu4,30331
|
16
|
+
sglang/lang/ir.py,sha256=W3UfZikcGeT86PDDjDjw-yNzrKY2e2UYO4DTatMCfm0,17704
|
17
|
+
sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
|
18
|
+
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
+
sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
|
20
|
+
sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
|
21
|
+
sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
|
22
|
+
sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
|
23
|
+
sglang/lang/backend/runtime_endpoint.py,sha256=hpezro0H6vG9KzLeKfYpPMwb4TaE0UanCIM0uG8Kdjw,9746
|
24
|
+
sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
|
25
|
+
sglang/srt/conversation.py,sha256=2KDNe1suUPy6xqSkCx2xcO3pDPxTwqx5FaUxaqwCJ-M,19525
|
26
|
+
sglang/srt/hf_transformers_utils.py,sha256=5UXJ-LdP92Sk_T843M9BHdnxRrcyiYfWH2IEg3dWgKI,6085
|
27
|
+
sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
|
28
|
+
sglang/srt/model_config.py,sha256=68QQ8iUWQHPv01RBeH23mvay6iJg9DWmCogC_vUgFLk,6371
|
29
|
+
sglang/srt/server.py,sha256=yi8prs9_M0P0dOInrQLkHKiZ-oTigk_uzW8otEHImbU,19846
|
30
|
+
sglang/srt/server_args.py,sha256=GiDyPWCvYA_98mSE9LuvUoEodo9gRnNPPIPn0nFkxUs,18259
|
31
|
+
sglang/srt/utils.py,sha256=JJOlqRPbN_tSSNWj63syQpfz4v7hUwNvzWvOUpBh9SM,23746
|
32
|
+
sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
|
33
|
+
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
34
|
+
sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
|
35
|
+
sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
|
36
|
+
sglang/srt/constrained/fsm_cache.py,sha256=wigJs9PeTt-vYPJQEeUZwEKl6MFIfb5xy8uIg18bDbM,3132
|
37
|
+
sglang/srt/constrained/jump_forward.py,sha256=LWRsmGPQcH6KT87wXwCRqtblU3pcAVCEzO0nWPxevs0,6636
|
38
|
+
sglang/srt/layers/activation.py,sha256=JEXNTgqxoiU4N-gVm4XMjobhft4JKDcMrgTkfpsRUzM,4856
|
39
|
+
sglang/srt/layers/decode_attention.py,sha256=TPD_608ZX9fQ_HDImifkxG_qcEYmimbEYY8lCBIjFuM,16628
|
40
|
+
sglang/srt/layers/extend_attention.py,sha256=XIXm3p2cvKrDg10Po4qYGaEkXJOJBtCIhTB_lTyjAFE,14390
|
41
|
+
sglang/srt/layers/layernorm.py,sha256=RXuS4UyksatqTF6lSK7VYyEiUEnBiNIBlEn8q4w84UA,3404
|
42
|
+
sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZSVDyASAc,13085
|
43
|
+
sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
|
44
|
+
sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
|
45
|
+
sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
|
46
|
+
sglang/srt/layers/sampler.py,sha256=zPVa3PHc-tjDM_oP-1XFeHSRIErx844SLoe6MG8Qef0,6418
|
47
|
+
sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
|
48
|
+
sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
|
49
|
+
sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
|
50
|
+
sglang/srt/managers/controller_multi.py,sha256=z3rguY1YYlSvVqLjKuurgJW1h0dxwPgIdPCQdJsVzYs,6478
|
51
|
+
sglang/srt/managers/controller_single.py,sha256=5brrZ8vZxjvrSJHWrm5H3qGEZShN4EROG5r1o3pSjps,5124
|
52
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
|
53
|
+
sglang/srt/managers/io_struct.py,sha256=Bd91cydX9_960NNP2xngqK-lsIaDB3oMYd56QddN4_Q,10722
|
54
|
+
sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
|
55
|
+
sglang/srt/managers/schedule_batch.py,sha256=i68O-e9I_gDlme96xSBDjA2xDF1p-XBKvJRiJ9CsgcY,26423
|
56
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=8aHR5h9nYZsfdZE80uBc9egDFOQgKvjxmp-30Ha4ELk,29463
|
57
|
+
sglang/srt/managers/tp_worker.py,sha256=4UuaBLzV6NMsG4XEIcpa4xMcOKIFvTan51ynKz85HXg,36842
|
58
|
+
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
59
|
+
sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
|
60
|
+
sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
|
61
|
+
sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
|
62
|
+
sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
|
63
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=4vIUqVQpnHNhwWrokMVmGM4Dp5JFPHyXIvpEQsi2pNU,12862
|
64
|
+
sglang/srt/model_executor/forward_batch_info.py,sha256=fSLhatN8vCgxn0Mft9D-r0pNi3SN0EQSTJmgaOtrqJc,16471
|
65
|
+
sglang/srt/model_executor/model_runner.py,sha256=93YCStmZfdZlY0r-GGIVi0Xw66VwF77dEtGVmQf1VfU,23893
|
66
|
+
sglang/srt/models/chatglm.py,sha256=PPOaeqipbkcsTUhMPbLb1HItWgW7KntefUfjEoMSxUM,13585
|
67
|
+
sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
|
68
|
+
sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
|
69
|
+
sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
|
70
|
+
sglang/srt/models/deepseek_v2.py,sha256=Htw_HDju9huYU5gBu2dqq6bKVao-AsifxfkGl2xRx-8,28521
|
71
|
+
sglang/srt/models/exaone.py,sha256=ZFr0G0WITxg3dDfV_-vWqZpK_wMmiZi4r0vOT0gO9V4,13301
|
72
|
+
sglang/srt/models/gemma.py,sha256=Ya_u2lKPKAc9iHEsW_HAEfCDgYTbxUOCzBI0LDuoOYs,12489
|
73
|
+
sglang/srt/models/gemma2.py,sha256=MCmzzRAAafEQuQj6aGtB-TF4jH0RWrXcOPxSz6LRsXs,15137
|
74
|
+
sglang/srt/models/gpt_bigcode.py,sha256=HEhMRO1Y37JfZtP7mDp0MexWj5h6XT9rKvxorOMKoQA,10409
|
75
|
+
sglang/srt/models/grok.py,sha256=ZcJ4E11rKh-xo4k_j-H1XRreJWWv8yii-bMYC1lO2R8,15143
|
76
|
+
sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
|
77
|
+
sglang/srt/models/llama.py,sha256=MfDnlVWoJUG9DxgGYPiwhoU-0ZeRbhp6UmBR2ZAJSNk,13402
|
78
|
+
sglang/srt/models/llama_classification.py,sha256=oSeROs633Gnak8vrbnWnCWDxfgP_zmKGO1A_43ukEQ4,4029
|
79
|
+
sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
|
80
|
+
sglang/srt/models/llava.py,sha256=OXmlOVIjFnMRKGwLweYB1N-xlfpZlTlZpqhsbwUCY6Y,23471
|
81
|
+
sglang/srt/models/llavavid.py,sha256=4R2t8BZJKN85IrTLsLFb4yZuKVI2Cwp7kY8AJ-nEVoE,12012
|
82
|
+
sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
|
83
|
+
sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
|
84
|
+
sglang/srt/models/mixtral.py,sha256=KIsvruhXNq3Fwrs4_YE7J6fx54ObfnMuRNxgScE3Bmo,13830
|
85
|
+
sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
|
86
|
+
sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
|
87
|
+
sglang/srt/models/qwen2.py,sha256=WGYy3wcRY3f8Drd9I8GblXfv0bbHluRKVhnnhEZf584,12654
|
88
|
+
sglang/srt/models/qwen2_moe.py,sha256=b0gd42GBWyvDmUu8BZbD9ZJO_ExbXBLQZRvu61UuXOA,17086
|
89
|
+
sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI,11573
|
90
|
+
sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
|
91
|
+
sglang/srt/openai_api/adapter.py,sha256=3EeqASZXogpUkOP4xj7Rg_LfOLiIMUrZ9uFdeAy_pcc,50144
|
92
|
+
sglang/srt/openai_api/protocol.py,sha256=onhnCjXpXCysvx_dLgOEmXz5XHHYB1t772cvHcK1GlY,9538
|
93
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=CIoD0SzHSWCe7Wc4jkJj5vIPHGnOdfbgkC6fG5KQxOw,7551
|
94
|
+
sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
|
95
|
+
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
96
|
+
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
|
97
|
+
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
|
98
|
+
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
|
99
|
+
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
|
100
|
+
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
|
101
|
+
sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
|
102
|
+
sglang/test/runners.py,sha256=7N2g4vyqN98o6F0Lem5LUNAlW9ShEVxZxZuzSjmc0i4,7688
|
103
|
+
sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
|
104
|
+
sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
|
105
|
+
sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
|
106
|
+
sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
|
107
|
+
sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
|
108
|
+
sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
|
109
|
+
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
110
|
+
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
111
|
+
sglang/test/test_programs.py,sha256=l21J8N91QTMO9TOvXPWNvPZVT0DgxYxOPHh1pOoFV_k,16927
|
112
|
+
sglang/test/test_utils.py,sha256=3tt-BBv-lx7BT3whbVTMyRz6sh5jIbdBEbLZ08m2Ms8,15132
|
113
|
+
sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
|
114
|
+
sglang-0.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
115
|
+
sglang-0.3.0.dist-info/METADATA,sha256=muukBuN4kq_4mCG_r_RFY94pQliDcVh-WuXNMApXoak,37383
|
116
|
+
sglang-0.3.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
|
117
|
+
sglang-0.3.0.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
118
|
+
sglang-0.3.0.dist-info/RECORD,,
|