sglang 0.2.14.post1__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +2 -0
- sglang/bench_latency.py +39 -28
- sglang/lang/interpreter.py +3 -0
- sglang/lang/ir.py +5 -0
- sglang/launch_server_llavavid.py +26 -0
- sglang/srt/configs/__init__.py +5 -0
- sglang/srt/configs/exaone.py +195 -0
- sglang/srt/constrained/fsm_cache.py +1 -1
- sglang/srt/conversation.py +24 -2
- sglang/srt/hf_transformers_utils.py +11 -160
- sglang/srt/layers/activation.py +10 -4
- sglang/srt/layers/extend_attention.py +13 -8
- sglang/srt/layers/layernorm.py +47 -1
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/sampler.py +69 -16
- sglang/srt/managers/controller_multi.py +5 -5
- sglang/srt/managers/controller_single.py +5 -5
- sglang/srt/managers/io_struct.py +11 -5
- sglang/srt/managers/schedule_batch.py +25 -13
- sglang/srt/managers/tokenizer_manager.py +76 -63
- sglang/srt/managers/tp_worker.py +47 -36
- sglang/srt/model_config.py +3 -3
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +78 -43
- sglang/srt/model_executor/model_runner.py +29 -18
- sglang/srt/models/chatglm.py +5 -13
- sglang/srt/models/commandr.py +5 -1
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +5 -1
- sglang/srt/models/deepseek_v2.py +57 -25
- sglang/srt/models/exaone.py +399 -0
- sglang/srt/models/gemma.py +7 -3
- sglang/srt/models/gemma2.py +6 -52
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +14 -4
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +10 -7
- sglang/srt/models/llama_classification.py +2 -6
- sglang/srt/models/llama_embedding.py +3 -4
- sglang/srt/models/llava.py +69 -91
- sglang/srt/models/llavavid.py +40 -86
- sglang/srt/models/minicpm.py +5 -1
- sglang/srt/models/mixtral.py +6 -2
- sglang/srt/models/mixtral_quant.py +5 -1
- sglang/srt/models/qwen.py +5 -2
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_moe.py +12 -33
- sglang/srt/models/stablelm.py +5 -1
- sglang/srt/models/yivl.py +2 -7
- sglang/srt/openai_api/adapter.py +16 -1
- sglang/srt/openai_api/protocol.py +5 -5
- sglang/srt/sampling/sampling_batch_info.py +79 -6
- sglang/srt/server.py +9 -9
- sglang/srt/utils.py +18 -36
- sglang/test/runners.py +2 -2
- sglang/test/test_layernorm.py +53 -1
- sglang/version.py +1 -1
- {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/METADATA +8 -8
- sglang-0.2.15.dist-info/RECORD +118 -0
- sglang-0.2.14.post1.dist-info/RECORD +0 -114
- {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/WHEEL +0 -0
- {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/top_level.txt +0 -0
@@ -21,10 +21,63 @@ class SamplingBatchInfo:
|
|
21
21
|
top_ps: torch.Tensor = None
|
22
22
|
top_ks: torch.Tensor = None
|
23
23
|
min_ps: torch.Tensor = None
|
24
|
-
|
24
|
+
|
25
|
+
# Dispatch in CUDA graph
|
26
|
+
need_min_p_sampling: bool = False
|
27
|
+
|
28
|
+
# Bias Tensors
|
25
29
|
logit_bias: torch.Tensor = None
|
26
30
|
vocab_mask: torch.Tensor = None
|
27
31
|
|
32
|
+
# Penalizer
|
33
|
+
penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
|
34
|
+
linear_penalties: torch.Tensor = None
|
35
|
+
scaling_penalties: torch.Tensor = None
|
36
|
+
|
37
|
+
def has_bias(self):
|
38
|
+
return (
|
39
|
+
self.logit_bias is not None
|
40
|
+
or self.vocab_mask is not None
|
41
|
+
or self.linear_penalties is not None
|
42
|
+
or self.scaling_penalties is not None
|
43
|
+
)
|
44
|
+
|
45
|
+
@classmethod
|
46
|
+
def dummy_one(cls, max_bs: int, vocab_size: int):
|
47
|
+
ret = cls(vocab_size=vocab_size)
|
48
|
+
ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
|
49
|
+
ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
|
50
|
+
ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
|
51
|
+
ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
|
52
|
+
return ret
|
53
|
+
|
54
|
+
def __getitem__(self, key):
|
55
|
+
if isinstance(key, slice):
|
56
|
+
# NOTE: We do not use cuda graph when there is bias tensors
|
57
|
+
assert not self.has_bias()
|
58
|
+
return SamplingBatchInfo(
|
59
|
+
vocab_size=self.vocab_size,
|
60
|
+
temperatures=self.temperatures[key],
|
61
|
+
top_ps=self.top_ps[key],
|
62
|
+
top_ks=self.top_ks[key],
|
63
|
+
min_ps=self.min_ps[key],
|
64
|
+
need_min_p_sampling=self.need_min_p_sampling,
|
65
|
+
)
|
66
|
+
else:
|
67
|
+
raise NotImplementedError
|
68
|
+
|
69
|
+
def inplace_assign(self, bs: int, other: SamplingBatchInfo):
|
70
|
+
# NOTE: We do not use cuda graph when there is bias tensors
|
71
|
+
assert not self.has_bias()
|
72
|
+
|
73
|
+
self.vocab_size = other.vocab_size
|
74
|
+
self.need_min_p_sampling = other.need_min_p_sampling
|
75
|
+
|
76
|
+
self.temperatures[:bs] = other.temperatures
|
77
|
+
self.top_ps[:bs] = other.top_ps
|
78
|
+
self.top_ks[:bs] = other.top_ks
|
79
|
+
self.min_ps[:bs] = other.min_ps
|
80
|
+
|
28
81
|
@classmethod
|
29
82
|
def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
|
30
83
|
device = "cuda"
|
@@ -45,6 +98,7 @@ class SamplingBatchInfo:
|
|
45
98
|
ret.min_ps = torch.tensor(
|
46
99
|
[r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
|
47
100
|
)
|
101
|
+
ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)
|
48
102
|
|
49
103
|
# Each penalizers will do nothing if they evaluate themselves as not required by looking at
|
50
104
|
# the sampling_params of the requests (See {_is_required()} of each penalizers). So this
|
@@ -72,6 +126,25 @@ class SamplingBatchInfo:
|
|
72
126
|
|
73
127
|
return ret
|
74
128
|
|
129
|
+
def prepare_penalties(self):
|
130
|
+
self.scaling_penalties = None
|
131
|
+
self.linear_penalties = None
|
132
|
+
|
133
|
+
for penalizer in self.penalizer_orchestrator.penalizers.values():
|
134
|
+
if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
|
135
|
+
if penalizer.is_prepared():
|
136
|
+
self.scaling_penalties = penalizer.cumulated_repetition_penalties
|
137
|
+
else:
|
138
|
+
if penalizer.is_prepared():
|
139
|
+
if self.linear_penalties is None:
|
140
|
+
bs = self.penalizer_orchestrator.batch.batch_size()
|
141
|
+
self.linear_penalties = torch.zeros(
|
142
|
+
(bs, self.vocab_size),
|
143
|
+
dtype=torch.float32,
|
144
|
+
device="cuda",
|
145
|
+
)
|
146
|
+
self.linear_penalties = penalizer.apply(self.linear_penalties)
|
147
|
+
|
75
148
|
def update_regex_vocab_mask(self, batch: ScheduleBatch):
|
76
149
|
bs, reqs = batch.batch_size(), batch.reqs
|
77
150
|
device = "cuda"
|
@@ -81,15 +154,15 @@ class SamplingBatchInfo:
|
|
81
154
|
self.vocab_mask = None
|
82
155
|
|
83
156
|
if has_regex:
|
157
|
+
self.vocab_mask = torch.zeros(
|
158
|
+
bs, self.vocab_size, dtype=torch.bool, device=device
|
159
|
+
)
|
84
160
|
for i, req in enumerate(reqs):
|
85
161
|
if req.regex_fsm is not None:
|
86
|
-
|
87
|
-
self.vocab_mask = torch.zeros(
|
88
|
-
bs, self.vocab_size, dtype=torch.bool, device=device
|
89
|
-
)
|
162
|
+
self.vocab_mask[i].fill_(1)
|
90
163
|
self.vocab_mask[i][
|
91
164
|
req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
|
92
|
-
] =
|
165
|
+
] = 0
|
93
166
|
|
94
167
|
def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
|
95
168
|
self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
|
sglang/srt/server.py
CHANGED
@@ -272,7 +272,7 @@ async def retrieve_file_content(file_id: str):
|
|
272
272
|
|
273
273
|
def launch_server(
|
274
274
|
server_args: ServerArgs,
|
275
|
-
|
275
|
+
model_override_args: Optional[dict] = None,
|
276
276
|
pipe_finish_writer: Optional[mp.connection.Connection] = None,
|
277
277
|
):
|
278
278
|
"""Launch an HTTP server."""
|
@@ -317,7 +317,7 @@ def launch_server(
|
|
317
317
|
tp_rank_range,
|
318
318
|
server_args,
|
319
319
|
ports[3],
|
320
|
-
|
320
|
+
model_override_args,
|
321
321
|
)
|
322
322
|
|
323
323
|
try:
|
@@ -328,20 +328,20 @@ def launch_server(
|
|
328
328
|
return
|
329
329
|
|
330
330
|
# Launch processes
|
331
|
-
tokenizer_manager = TokenizerManager(server_args, port_args,
|
331
|
+
tokenizer_manager = TokenizerManager(server_args, port_args, model_override_args)
|
332
332
|
if server_args.chat_template:
|
333
333
|
load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
|
334
334
|
pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
|
335
335
|
pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)
|
336
336
|
|
337
337
|
if server_args.dp_size == 1:
|
338
|
-
|
338
|
+
start_controller_process = start_controller_process_single
|
339
339
|
else:
|
340
|
-
|
340
|
+
start_controller_process = start_controller_process_multi
|
341
341
|
|
342
342
|
proc_controller = mp.Process(
|
343
|
-
target=
|
344
|
-
args=(server_args, port_args, pipe_controller_writer,
|
343
|
+
target=start_controller_process,
|
344
|
+
args=(server_args, port_args, pipe_controller_writer, model_override_args),
|
345
345
|
)
|
346
346
|
proc_controller.start()
|
347
347
|
|
@@ -501,7 +501,7 @@ class Runtime:
|
|
501
501
|
def __init__(
|
502
502
|
self,
|
503
503
|
log_level: str = "error",
|
504
|
-
|
504
|
+
model_override_args: Optional[dict] = None,
|
505
505
|
*args,
|
506
506
|
**kwargs,
|
507
507
|
):
|
@@ -525,7 +525,7 @@ class Runtime:
|
|
525
525
|
|
526
526
|
proc = mp.Process(
|
527
527
|
target=launch_server,
|
528
|
-
args=(self.server_args,
|
528
|
+
args=(self.server_args, model_override_args, pipe_writer),
|
529
529
|
)
|
530
530
|
proc.start()
|
531
531
|
pipe_writer.close()
|
sglang/srt/utils.py
CHANGED
@@ -26,7 +26,7 @@ import struct
|
|
26
26
|
import time
|
27
27
|
from importlib.metadata import PackageNotFoundError, version
|
28
28
|
from io import BytesIO
|
29
|
-
from typing import List, Optional
|
29
|
+
from typing import List, Optional, Union
|
30
30
|
|
31
31
|
import numpy as np
|
32
32
|
import psutil
|
@@ -193,35 +193,16 @@ def allocate_init_ports(
|
|
193
193
|
return ret_ports[0], ret_ports[1:num_ports_needed]
|
194
194
|
|
195
195
|
|
196
|
-
def
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
logit_bias[t_id] = -1e5
|
207
|
-
|
208
|
-
return logit_bias
|
209
|
-
|
210
|
-
|
211
|
-
def is_multimodal_model(model):
|
212
|
-
from sglang.srt.model_config import ModelConfig
|
213
|
-
|
214
|
-
if isinstance(model, str):
|
215
|
-
model = model.lower()
|
216
|
-
return "llava" in model or "yi-vl" in model or "llava-next" in model
|
217
|
-
|
218
|
-
if isinstance(model, ModelConfig):
|
219
|
-
model_path = model.path.lower()
|
220
|
-
return (
|
221
|
-
"llava" in model_path or "yi-vl" in model_path or "llava-next" in model_path
|
222
|
-
)
|
223
|
-
|
224
|
-
raise ValueError("unrecognized type")
|
196
|
+
def is_multimodal_model(model_architectures):
|
197
|
+
if (
|
198
|
+
"LlavaLlamaForCausalLM" in model_architectures
|
199
|
+
or "LlavaQwenForCausalLM" in model_architectures
|
200
|
+
or "LlavaMistralForCausalLM" in model_architectures
|
201
|
+
or "LlavaVidForCausalLM" in model_architectures
|
202
|
+
):
|
203
|
+
return True
|
204
|
+
else:
|
205
|
+
return False
|
225
206
|
|
226
207
|
|
227
208
|
def is_generation_model(model_architectures, is_embedding: bool = False):
|
@@ -317,12 +298,14 @@ def decode_video_base64(video_base64):
|
|
317
298
|
) # Return an empty array and size tuple if no frames were found
|
318
299
|
|
319
300
|
|
320
|
-
def load_image(image_file):
|
301
|
+
def load_image(image_file: Union[str, bytes]):
|
321
302
|
from PIL import Image
|
322
303
|
|
323
304
|
image = image_size = None
|
324
305
|
|
325
|
-
if image_file
|
306
|
+
if isinstance(image_file, bytes):
|
307
|
+
image = Image.open(BytesIO(image_file))
|
308
|
+
elif image_file.startswith("http://") or image_file.startswith("https://"):
|
326
309
|
timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
|
327
310
|
response = requests.get(image_file, timeout=timeout)
|
328
311
|
image = Image.open(BytesIO(response.content))
|
@@ -334,8 +317,10 @@ def load_image(image_file):
|
|
334
317
|
elif image_file.startswith("video:"):
|
335
318
|
image_file = image_file.replace("video:", "")
|
336
319
|
image, image_size = decode_video_base64(image_file)
|
337
|
-
|
320
|
+
elif isinstance(image_file, str):
|
338
321
|
image = Image.open(BytesIO(base64.b64decode(image_file)))
|
322
|
+
else:
|
323
|
+
raise ValueError(f"Invalid image: {image}")
|
339
324
|
|
340
325
|
return image, image_size
|
341
326
|
|
@@ -422,7 +407,6 @@ def monkey_patch_vllm_dummy_weight_loader():
|
|
422
407
|
DummyModelLoader,
|
423
408
|
LoRAConfig,
|
424
409
|
ModelConfig,
|
425
|
-
MultiModalConfig,
|
426
410
|
ParallelConfig,
|
427
411
|
SchedulerConfig,
|
428
412
|
_initialize_model,
|
@@ -437,7 +421,6 @@ def monkey_patch_vllm_dummy_weight_loader():
|
|
437
421
|
model_config: ModelConfig,
|
438
422
|
device_config: DeviceConfig,
|
439
423
|
lora_config: Optional[LoRAConfig],
|
440
|
-
multimodal_config: Optional[MultiModalConfig],
|
441
424
|
parallel_config: ParallelConfig,
|
442
425
|
scheduler_config: SchedulerConfig,
|
443
426
|
cache_config: CacheConfig,
|
@@ -448,7 +431,6 @@ def monkey_patch_vllm_dummy_weight_loader():
|
|
448
431
|
model_config,
|
449
432
|
self.load_config,
|
450
433
|
lora_config,
|
451
|
-
multimodal_config,
|
452
434
|
cache_config,
|
453
435
|
)
|
454
436
|
|
sglang/test/runners.py
CHANGED
@@ -30,7 +30,7 @@ DEFAULT_PROMPTS = [
|
|
30
30
|
# the output of gemma-2-2b from SRT is unstable on the commented prompt
|
31
31
|
# "The capital of France is",
|
32
32
|
"Apple is red. Banana is Yellow. " * 800 + "Apple is",
|
33
|
-
"The capital of the United
|
33
|
+
"The capital of the United Kingdom is",
|
34
34
|
"Today is a sunny day and I like",
|
35
35
|
"AI is a field of computer science focused on",
|
36
36
|
]
|
@@ -180,7 +180,7 @@ class SRTRunner:
|
|
180
180
|
tp_size=tp_size,
|
181
181
|
dtype=get_dtype_str(torch_dtype),
|
182
182
|
port=port,
|
183
|
-
mem_fraction_static=0.
|
183
|
+
mem_fraction_static=0.69,
|
184
184
|
trust_remote_code=False,
|
185
185
|
is_embedding=not self.is_generation,
|
186
186
|
)
|
sglang/test/test_layernorm.py
CHANGED
@@ -3,7 +3,7 @@ import unittest
|
|
3
3
|
|
4
4
|
import torch
|
5
5
|
|
6
|
-
from sglang.srt.layers.layernorm import RMSNorm
|
6
|
+
from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
|
7
7
|
|
8
8
|
|
9
9
|
class TestRMSNorm(unittest.TestCase):
|
@@ -56,5 +56,57 @@ class TestRMSNorm(unittest.TestCase):
|
|
56
56
|
self._run_rms_norm_test(*params)
|
57
57
|
|
58
58
|
|
59
|
+
class TestGemmaRMSNorm(unittest.TestCase):
|
60
|
+
DTYPES = [torch.half, torch.bfloat16]
|
61
|
+
NUM_TOKENS = [7, 83, 4096]
|
62
|
+
HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
|
63
|
+
ADD_RESIDUAL = [False, True]
|
64
|
+
SEEDS = [0]
|
65
|
+
|
66
|
+
@classmethod
|
67
|
+
def setUpClass(cls):
|
68
|
+
if not torch.cuda.is_available():
|
69
|
+
raise unittest.SkipTest("CUDA is not available")
|
70
|
+
torch.set_default_device("cuda")
|
71
|
+
|
72
|
+
def _run_gemma_rms_norm_test(
|
73
|
+
self, num_tokens, hidden_size, add_residual, dtype, seed
|
74
|
+
):
|
75
|
+
torch.manual_seed(seed)
|
76
|
+
|
77
|
+
layer = GemmaRMSNorm(hidden_size).to(dtype=dtype)
|
78
|
+
layer.weight.data.normal_(mean=1.0, std=0.1)
|
79
|
+
scale = 1 / (2 * hidden_size)
|
80
|
+
x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
|
81
|
+
residual = torch.randn_like(x) * scale if add_residual else None
|
82
|
+
|
83
|
+
with torch.inference_mode():
|
84
|
+
ref_out = layer.forward_native(x, residual)
|
85
|
+
out = layer(x, residual)
|
86
|
+
|
87
|
+
if add_residual:
|
88
|
+
self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-3, rtol=1e-3))
|
89
|
+
self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-3, rtol=1e-3))
|
90
|
+
else:
|
91
|
+
self.assertTrue(torch.allclose(out, ref_out, atol=1e-3, rtol=1e-3))
|
92
|
+
|
93
|
+
def test_gemma_rms_norm(self):
|
94
|
+
for params in itertools.product(
|
95
|
+
self.NUM_TOKENS,
|
96
|
+
self.HIDDEN_SIZES,
|
97
|
+
self.ADD_RESIDUAL,
|
98
|
+
self.DTYPES,
|
99
|
+
self.SEEDS,
|
100
|
+
):
|
101
|
+
with self.subTest(
|
102
|
+
num_tokens=params[0],
|
103
|
+
hidden_size=params[1],
|
104
|
+
add_residual=params[2],
|
105
|
+
dtype=params[3],
|
106
|
+
seed=params[4],
|
107
|
+
):
|
108
|
+
self._run_gemma_rms_norm_test(*params)
|
109
|
+
|
110
|
+
|
59
111
|
if __name__ == "__main__":
|
60
112
|
unittest.main(verbosity=2)
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.2.
|
1
|
+
__version__ = "0.2.15"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.15
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
312
312
|
### Method 2: From source
|
313
313
|
```
|
314
314
|
# Use the last release branch
|
315
|
-
git clone -b v0.2.
|
315
|
+
git clone -b v0.2.15 https://github.com/sgl-project/sglang.git
|
316
316
|
cd sglang
|
317
317
|
|
318
318
|
pip install --upgrade pip
|
@@ -489,14 +489,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
489
489
|
### Supported Models
|
490
490
|
|
491
491
|
**Generative Models**
|
492
|
-
|
493
492
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
494
493
|
- Mistral / Mixtral / Mistral NeMo
|
495
494
|
- Gemma / Gemma 2
|
496
495
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
497
496
|
- DeepSeek / DeepSeek 2
|
498
497
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
499
|
-
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava
|
498
|
+
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
|
500
499
|
- Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
|
501
500
|
- LLaVA 1.5 / 1.6 / NeXT
|
502
501
|
- `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
|
@@ -509,6 +508,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
509
508
|
- Grok
|
510
509
|
- ChatGLM
|
511
510
|
- InternLM 2
|
511
|
+
- Exaone 3
|
512
512
|
|
513
513
|
**Embedding Models**
|
514
514
|
|
@@ -636,7 +636,7 @@ print(state["answer_1"])
|
|
636
636
|
#### More Examples
|
637
637
|
|
638
638
|
Anthropic and VertexAI (Gemini) models are also supported.
|
639
|
-
You can find more examples at [examples/quick_start](examples/quick_start).
|
639
|
+
You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).
|
640
640
|
|
641
641
|
### Language Feature
|
642
642
|
To begin with, import sglang.
|
@@ -649,7 +649,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
|
|
649
649
|
You can then invoke the function with `run` or `run_batch`.
|
650
650
|
The system will manage the state, chat template, parallelism and batching for you.
|
651
651
|
|
652
|
-
The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
|
652
|
+
The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)
|
653
653
|
|
654
654
|
#### Control Flow
|
655
655
|
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
|
@@ -698,7 +698,7 @@ def image_qa(s, image_file, question):
|
|
698
698
|
s += sgl.assistant(sgl.gen("answer", max_tokens=256)
|
699
699
|
```
|
700
700
|
|
701
|
-
See also [srt_example_llava.py](examples/quick_start/
|
701
|
+
See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).
|
702
702
|
|
703
703
|
#### Constrained Decoding
|
704
704
|
Use `regex` to specify a regular expression as a decoding constraint.
|
@@ -742,7 +742,7 @@ def character_gen(s, name):
|
|
742
742
|
s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
|
743
743
|
```
|
744
744
|
|
745
|
-
See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
|
745
|
+
See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
|
746
746
|
|
747
747
|
#### Batching
|
748
748
|
Use `run_batch` to run a batch of requests with continuous batching.
|
@@ -0,0 +1,118 @@
|
|
1
|
+
sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
|
2
|
+
sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
|
3
|
+
sglang/bench_latency.py,sha256=F7jMfKqMf1XFKJgkpR_yE33VJpsIhSr_SOJeRbngkb0,16758
|
4
|
+
sglang/bench_serving.py,sha256=J_mMwnmDn0Jt07mzdGAuYOxpockHPLYJFL-kwoaqASY,36527
|
5
|
+
sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
|
6
|
+
sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
|
7
|
+
sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
|
8
|
+
sglang/launch_server_llavavid.py,sha256=xnpSILJxsrbvqkERav5P26bErCQnhoTFmoKeScJltUA,1034
|
9
|
+
sglang/utils.py,sha256=zFYGkC4vOUR3sTv1TmQXcsOLZDtDBR3wnjqnDp3xMIs,8352
|
10
|
+
sglang/version.py,sha256=ogr0x4sazo5ruMrKOQDYO_YrTwtaXZTE8fKnwCajH7I,23
|
11
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
+
sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
|
13
|
+
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
14
|
+
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
15
|
+
sglang/lang/interpreter.py,sha256=AC3tNNDwYfiu87jCldBWXYpFicCv6NMPJACMFEfCXu4,30331
|
16
|
+
sglang/lang/ir.py,sha256=W3UfZikcGeT86PDDjDjw-yNzrKY2e2UYO4DTatMCfm0,17704
|
17
|
+
sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
|
18
|
+
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
+
sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
|
20
|
+
sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
|
21
|
+
sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
|
22
|
+
sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
|
23
|
+
sglang/lang/backend/runtime_endpoint.py,sha256=SDlp03EuQEK1eGK4_IaFySWgxlp4wCs3EPewZ6O640E,9549
|
24
|
+
sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
|
25
|
+
sglang/srt/conversation.py,sha256=2KDNe1suUPy6xqSkCx2xcO3pDPxTwqx5FaUxaqwCJ-M,19525
|
26
|
+
sglang/srt/hf_transformers_utils.py,sha256=kNGJ5OfAth7dZrWfhpKpt7s2LQWvLH2d-v0GtcEs3R0,6078
|
27
|
+
sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
|
28
|
+
sglang/srt/model_config.py,sha256=68QQ8iUWQHPv01RBeH23mvay6iJg9DWmCogC_vUgFLk,6371
|
29
|
+
sglang/srt/server.py,sha256=yi8prs9_M0P0dOInrQLkHKiZ-oTigk_uzW8otEHImbU,19846
|
30
|
+
sglang/srt/server_args.py,sha256=GiDyPWCvYA_98mSE9LuvUoEodo9gRnNPPIPn0nFkxUs,18259
|
31
|
+
sglang/srt/utils.py,sha256=JJOlqRPbN_tSSNWj63syQpfz4v7hUwNvzWvOUpBh9SM,23746
|
32
|
+
sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
|
33
|
+
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
34
|
+
sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
|
35
|
+
sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
|
36
|
+
sglang/srt/constrained/fsm_cache.py,sha256=wigJs9PeTt-vYPJQEeUZwEKl6MFIfb5xy8uIg18bDbM,3132
|
37
|
+
sglang/srt/constrained/jump_forward.py,sha256=LWRsmGPQcH6KT87wXwCRqtblU3pcAVCEzO0nWPxevs0,6636
|
38
|
+
sglang/srt/layers/activation.py,sha256=JEXNTgqxoiU4N-gVm4XMjobhft4JKDcMrgTkfpsRUzM,4856
|
39
|
+
sglang/srt/layers/decode_attention.py,sha256=TPD_608ZX9fQ_HDImifkxG_qcEYmimbEYY8lCBIjFuM,16628
|
40
|
+
sglang/srt/layers/extend_attention.py,sha256=XIXm3p2cvKrDg10Po4qYGaEkXJOJBtCIhTB_lTyjAFE,14390
|
41
|
+
sglang/srt/layers/layernorm.py,sha256=RXuS4UyksatqTF6lSK7VYyEiUEnBiNIBlEn8q4w84UA,3404
|
42
|
+
sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZSVDyASAc,13085
|
43
|
+
sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
|
44
|
+
sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
|
45
|
+
sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
|
46
|
+
sglang/srt/layers/sampler.py,sha256=YEDZrwzshX-fZZ5tkW57yBBIJRu2SPAUZzXhhrpQs4Q,5543
|
47
|
+
sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
|
48
|
+
sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
|
49
|
+
sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
|
50
|
+
sglang/srt/managers/controller_multi.py,sha256=z3rguY1YYlSvVqLjKuurgJW1h0dxwPgIdPCQdJsVzYs,6478
|
51
|
+
sglang/srt/managers/controller_single.py,sha256=5brrZ8vZxjvrSJHWrm5H3qGEZShN4EROG5r1o3pSjps,5124
|
52
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
|
53
|
+
sglang/srt/managers/io_struct.py,sha256=Bd91cydX9_960NNP2xngqK-lsIaDB3oMYd56QddN4_Q,10722
|
54
|
+
sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
|
55
|
+
sglang/srt/managers/schedule_batch.py,sha256=D3NBNi_6_KEMfBTn_8XPrtCbXHjnUki0sOVhQ7kgqqM,26182
|
56
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=ung-uQrvtPn-vzpQMjpYW_jKWDJR_B8NL88WW3OWyy0,29435
|
57
|
+
sglang/srt/managers/tp_worker.py,sha256=4UuaBLzV6NMsG4XEIcpa4xMcOKIFvTan51ynKz85HXg,36842
|
58
|
+
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
59
|
+
sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
|
60
|
+
sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
|
61
|
+
sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
|
62
|
+
sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
|
63
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=qyKjW9TjSjZ-NZI3aspJwnmuKSKT6DX1MMTFwqJtNE8,12751
|
64
|
+
sglang/srt/model_executor/forward_batch_info.py,sha256=fSLhatN8vCgxn0Mft9D-r0pNi3SN0EQSTJmgaOtrqJc,16471
|
65
|
+
sglang/srt/model_executor/model_runner.py,sha256=9ard4FLjb_rz0EUS3KMrlDkos0zNGh5TQ6wlHSIsev4,24408
|
66
|
+
sglang/srt/models/chatglm.py,sha256=BzLtDK_CsD1Pmn-sHnJuLulJCUuSbNm1q1fqCShRdQ8,13628
|
67
|
+
sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
|
68
|
+
sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
|
69
|
+
sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
|
70
|
+
sglang/srt/models/deepseek_v2.py,sha256=Htw_HDju9huYU5gBu2dqq6bKVao-AsifxfkGl2xRx-8,28521
|
71
|
+
sglang/srt/models/exaone.py,sha256=58JELgg-dZl6CUNd2PEWR0ok9u4osOuE5QKSfX6MzhE,14480
|
72
|
+
sglang/srt/models/gemma.py,sha256=Ya_u2lKPKAc9iHEsW_HAEfCDgYTbxUOCzBI0LDuoOYs,12489
|
73
|
+
sglang/srt/models/gemma2.py,sha256=MCmzzRAAafEQuQj6aGtB-TF4jH0RWrXcOPxSz6LRsXs,15137
|
74
|
+
sglang/srt/models/gpt_bigcode.py,sha256=HEhMRO1Y37JfZtP7mDp0MexWj5h6XT9rKvxorOMKoQA,10409
|
75
|
+
sglang/srt/models/grok.py,sha256=ZcJ4E11rKh-xo4k_j-H1XRreJWWv8yii-bMYC1lO2R8,15143
|
76
|
+
sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
|
77
|
+
sglang/srt/models/llama2.py,sha256=NriIElOdhhsiJFmNPc4bDXjxU_FgqfqdtoagSuIcnnc,14394
|
78
|
+
sglang/srt/models/llama_classification.py,sha256=ClNlaLi3Z0ME1ETOwGxl8DtJy8VJu8kobVRFX9jKJqM,4704
|
79
|
+
sglang/srt/models/llama_embedding.py,sha256=Z3FWGNEWrperMxnVqOhxv6vApNpChh-AaahlEqeYOrk,3574
|
80
|
+
sglang/srt/models/llava.py,sha256=ypq0hWprqN73P-VuYfSAZ1_Otm48qDqEPA2YO583goM,23453
|
81
|
+
sglang/srt/models/llavavid.py,sha256=Dx_wED6stC8lTASUrGt6B3c8wQ9lVrX-76-dNyyuVVg,11934
|
82
|
+
sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
|
83
|
+
sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
|
84
|
+
sglang/srt/models/mixtral.py,sha256=KIsvruhXNq3Fwrs4_YE7J6fx54ObfnMuRNxgScE3Bmo,13830
|
85
|
+
sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
|
86
|
+
sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
|
87
|
+
sglang/srt/models/qwen2.py,sha256=WGYy3wcRY3f8Drd9I8GblXfv0bbHluRKVhnnhEZf584,12654
|
88
|
+
sglang/srt/models/qwen2_moe.py,sha256=b0gd42GBWyvDmUu8BZbD9ZJO_ExbXBLQZRvu61UuXOA,17086
|
89
|
+
sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI,11573
|
90
|
+
sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
|
91
|
+
sglang/srt/openai_api/adapter.py,sha256=3EeqASZXogpUkOP4xj7Rg_LfOLiIMUrZ9uFdeAy_pcc,50144
|
92
|
+
sglang/srt/openai_api/protocol.py,sha256=onhnCjXpXCysvx_dLgOEmXz5XHHYB1t772cvHcK1GlY,9538
|
93
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=WO7fgURK7XqXU3jORWpkz7Tyx3FC34r--hPMKvkt4Iw,7735
|
94
|
+
sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
|
95
|
+
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
96
|
+
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
|
97
|
+
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
|
98
|
+
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
|
99
|
+
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
|
100
|
+
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
|
101
|
+
sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
|
102
|
+
sglang/test/runners.py,sha256=7N2g4vyqN98o6F0Lem5LUNAlW9ShEVxZxZuzSjmc0i4,7688
|
103
|
+
sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
|
104
|
+
sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
|
105
|
+
sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
|
106
|
+
sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
|
107
|
+
sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
|
108
|
+
sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
|
109
|
+
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
110
|
+
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
111
|
+
sglang/test/test_programs.py,sha256=V_-Bx3lLkw37P6gDyA7mZCqxlyNMaFLBkRrPMQQQqn4,14909
|
112
|
+
sglang/test/test_utils.py,sha256=HD-9rcj7EFS_NX1GQFU5613ITQlZaTK2l9RmqA0F7x4,14380
|
113
|
+
sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
|
114
|
+
sglang-0.2.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
115
|
+
sglang-0.2.15.dist-info/METADATA,sha256=bmiMZPX1vW_NYDBk92pG1u9_PZRcXanJ2KXtxBmaiF4,37211
|
116
|
+
sglang-0.2.15.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
|
117
|
+
sglang-0.2.15.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
118
|
+
sglang-0.2.15.dist-info/RECORD,,
|