sglang 0.2.14.post1__py3-none-any.whl → 0.2.14.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- sglang/launch_server_llavavid.py +26 -0
- sglang/srt/hf_transformers_utils.py +0 -149
- sglang/srt/layers/activation.py +10 -4
- sglang/srt/layers/layernorm.py +47 -1
- sglang/srt/managers/io_struct.py +5 -4
- sglang/srt/managers/schedule_batch.py +5 -5
- sglang/srt/managers/tokenizer_manager.py +74 -61
- sglang/srt/managers/tp_worker.py +9 -10
- sglang/srt/model_executor/forward_batch_info.py +10 -20
- sglang/srt/model_executor/model_runner.py +15 -6
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +1 -51
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/llama2.py +3 -4
- sglang/srt/models/llama_classification.py +0 -4
- sglang/srt/models/llama_embedding.py +3 -4
- sglang/srt/models/llava.py +69 -91
- sglang/srt/models/llavavid.py +40 -86
- sglang/srt/models/qwen2.py +3 -4
- sglang/srt/models/qwen2_moe.py +7 -19
- sglang/srt/models/yivl.py +2 -7
- sglang/srt/server.py +3 -3
- sglang/srt/utils.py +18 -33
- sglang/test/runners.py +1 -1
- sglang/test/test_layernorm.py +53 -1
- sglang/version.py +1 -1
- {sglang-0.2.14.post1.dist-info → sglang-0.2.14.post2.dist-info}/METADATA +3 -3
- {sglang-0.2.14.post1.dist-info → sglang-0.2.14.post2.dist-info}/RECORD +32 -31
- {sglang-0.2.14.post1.dist-info → sglang-0.2.14.post2.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.post1.dist-info → sglang-0.2.14.post2.dist-info}/WHEEL +0 -0
- {sglang-0.2.14.post1.dist-info → sglang-0.2.14.post2.dist-info}/top_level.txt +0 -0
sglang/launch_server_llavavid.py
ADDED
@@ -0,0 +1,26 @@
+"""Launch the inference server for Llava-video model."""
+
+import argparse
+
+from sglang.srt.server import ServerArgs, launch_server
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+
+    model_overide_args = {}
+    model_overide_args["mm_spatial_pool_stride"] = 2
+    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
+    model_overide_args["num_frames"] = 16
+    model_overide_args["model_type"] = "llavavid"
+    if model_overide_args["num_frames"] == 32:
+        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_overide_args["max_sequence_length"] = 4096 * 2
+        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
+        model_overide_args["model_max_length"] = 4096 * 2
+    if "34b" in args.model_path.lower():
+        model_overide_args["image_token_index"] = 64002
+
+    launch_server(server_args, model_overide_args, None)
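The new launcher is driven entirely by ServerArgs command-line flags. As a rough illustration only (not part of the release), an equivalent programmatic launch might look like the sketch below; the model path and port are placeholders, not values taken from this diff.

# Illustrative sketch only -- model path and port are placeholders.
# The script itself is presumably run as e.g.
#   python -m sglang.launch_server_llavavid --model-path <path> --port 30000
from sglang.srt.server import ServerArgs, launch_server

server_args = ServerArgs(model_path="path/to/llava-video-model", port=30000)

# The same override dict the new script builds for 16-frame inference.
model_overide_args = {
    "mm_spatial_pool_stride": 2,
    "architectures": ["LlavaVidForCausalLM"],
    "num_frames": 16,
    "model_type": "llavavid",
}

launch_server(server_args, model_overide_args, None)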
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -119,24 +119,7 @@ def get_tokenizer(
     tokenizer_revision: Optional[str] = None,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if tokenizer_name.endswith(".json"):
-        return TiktokenTokenizer(tokenizer_name)
-
-    if tokenizer_name.endswith(".model"):
-        return SentencePieceTokenizer(tokenizer_name)
-
     """Gets a tokenizer for the given model name via Huggingface."""
-    if is_multimodal_model(tokenizer_name):
-        processor = get_processor(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            tokenizer_revision=tokenizer_revision,
-            **kwargs,
-        )
-        tokenizer = processor.tokenizer
-        return tokenizer
-
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
             raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
@@ -199,135 +182,3 @@ def get_processor(
         **kwargs,
     )
     return processor
-
-
-class TiktokenTokenizer:
-    def __init__(self, tokenizer_path):
-        import tiktoken
-        from jinja2 import Template
-
-        PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-
-        # Read JSON
-        name = "tmp-json"
-        with open(tokenizer_path, "rb") as fin:
-            tok_dict = json.load(fin)
-
-        mergeable_ranks = {
-            bytes(item["bytes"]): item["token"] for item in tok_dict["regular_tokens"]
-        }
-        special_tokens = {
-            bytes(item["bytes"]).decode(): item["token"]
-            for item in tok_dict["special_tokens"]
-        }
-        assert tok_dict["word_split"] == "V1"
-
-        default_allowed_special = None
-
-        kwargs = {
-            "name": name,
-            "pat_str": tok_dict.get("pat_str", PAT_STR_B),
-            "mergeable_ranks": mergeable_ranks,
-            "special_tokens": special_tokens,
-        }
-        if "default_allowed_special" in tok_dict:
-            default_allowed_special = set(
-                [
-                    bytes(bytes_list).decode()
-                    for bytes_list in tok_dict["default_allowed_special"]
-                ]
-            )
-        if "vocab_size" in tok_dict:
-            kwargs["explicit_n_vocab"] = tok_dict["vocab_size"]
-
-        PAD = "<|pad|>"
-        EOS = "<|eos|>"
-        SEP = "<|separator|>"
-
-        DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP}
-
-        tokenizer = tiktoken.Encoding(**kwargs)
-        tokenizer._default_allowed_special = default_allowed_special or set()
-        tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS
-
-        def encode_patched(
-            self,
-            text: str,
-            *,
-            allowed_special: Union[
-                Literal["all"], AbstractSet[str]
-            ] = set(),  # noqa: B006
-            disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-        ) -> List[int]:
-            if isinstance(allowed_special, set):
-                allowed_special |= self._default_allowed_special
-            return tiktoken.Encoding.encode(
-                self,
-                text,
-                allowed_special=allowed_special,
-                disallowed_special=(),
-            )
-
-        tokenizer.encode = functools.partial(encode_patched, tokenizer)
-
-        # Convert to HF interface
-        self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer._special_tokens[EOS]
-        self.vocab_size = tokenizer.n_vocab
-        self.chat_template = Template(
-            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        )
-
-    def encode(self, x, add_special_tokens=False):
-        return self.tokenizer.encode(x)
-
-    def decode(self, x):
-        return self.tokenizer.decode(x)
-
-    def batch_decode(
-        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-    ):
-        if isinstance(batch[0], int):
-            batch = [[x] for x in batch]
-        return self.tokenizer.decode_batch(batch)
-
-    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-        ret = self.chat_template.render(
-            messages=messages, add_generation_prompt=add_generation_prompt
-        )
-        return self.encode(ret) if tokenize else ret
-
-
-class SentencePieceTokenizer:
-    def __init__(self, tokenizer_path):
-        import sentencepiece as spm
-        from jinja2 import Template
-
-        tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
-
-        # Convert to HF interface
-        self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer.eos_id()
-        self.vocab_size = tokenizer.vocab_size()
-        self.chat_template = Template(
-            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        )
-
-    def encode(self, x, add_special_tokens=False):
-        return self.tokenizer.encode(x)
-
-    def decode(self, x):
-        return self.tokenizer.decode(x)
-
-    def batch_decode(
-        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-    ):
-        if isinstance(batch[0], int):
-            batch = [[x] for x in batch]
-        return self.tokenizer.decode(batch)
-
-    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-        ret = self.chat_template.render(
-            messages=messages, add_generation_prompt=add_generation_prompt
-        )
-        return self.encode(ret) if tokenize else ret
sglang/srt/layers/activation.py
CHANGED
@@ -18,7 +18,7 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from flashinfer.activation import gelu_tanh_and_mul, silu_and_mul
+from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
@@ -43,18 +43,24 @@ class SiluAndMul(CustomOp):
 
 
 class GeluAndMul(CustomOp):
-    def __init__(self, **kwargs):
+    def __init__(self, approximate="tanh"):
         super().__init__()
+        self.approximate = approximate
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        return F.gelu(x[..., :d], approximate="tanh") * x[..., d:]
+        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
         output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        gelu_tanh_and_mul(x, out)
+        if self.approximate == "tanh":
+            gelu_tanh_and_mul(x, out)
+        elif self.approximate == "none":
+            gelu_and_mul(x, out)
+        else:
+            raise RuntimeError("GeluAndMul only support tanh or none")
         return out
 
 
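For context, a minimal sketch of how the reworked GeluAndMul might be exercised through its native path (class and module names are from the diff; the shapes and values below are illustrative only, not taken from this release):

# Illustrative sketch only: exercising the new `approximate` argument of
# GeluAndMul via the pure-PyTorch forward_native path shown above.
import torch

from sglang.srt.layers.activation import GeluAndMul

x = torch.randn(4, 2 * 128)               # gate and up projections, concatenated
gelu_tanh = GeluAndMul()                   # default: approximate="tanh"
gelu_exact = GeluAndMul(approximate="none")

out_tanh = gelu_tanh.forward_native(x)     # tanh-approximated GELU * up half
out_exact = gelu_exact.forward_native(x)   # exact GELU * up half
assert out_tanh.shape == out_exact.shape == (4, 128)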
sglang/srt/layers/layernorm.py
CHANGED
@@ -19,7 +19,12 @@ from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from flashinfer.norm import (
+    fused_add_rmsnorm,
+    gemma_fused_add_rmsnorm,
+    gemma_rmsnorm,
+    rmsnorm,
+)
 from vllm.model_executor.custom_op import CustomOp
 
 
@@ -63,3 +68,44 @@ class RMSNorm(CustomOp):
             return x
         else:
             return x, residual
+
+
+class GemmaRMSNorm(CustomOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        if residual is not None:
+            x = x + residual
+            residual = x
+
+        x = x.float()
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x * (1.0 + self.weight.float())
+        x = x.to(orig_dtype)
+        return x if residual is None else (x, residual)
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            gemma_fused_add_rmsnorm(
+                x, residual, self.weight.data, self.variance_epsilon
+            )
+            return x, residual
+        out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
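The new GemmaRMSNorm differs from the existing RMSNorm in that its weight starts at zero and is applied as (1 + weight). A rough sketch of the native path (shapes and sizes below are illustrative, not taken from this release):

# Illustrative sketch only: behaviour of the new GemmaRMSNorm native path.
import torch

from sglang.srt.layers.layernorm import GemmaRMSNorm

norm = GemmaRMSNorm(hidden_size=256, eps=1e-6)
x = torch.randn(2, 8, 256)

# Without a residual, only the normalized tensor comes back.
y = norm.forward_native(x)

# With a residual, (x + residual) is normalized, and the summed tensor is
# also returned so the caller can feed it forward as the next residual.
residual = torch.randn_like(x)
y2, new_residual = norm.forward_native(x, residual)
assert y.shape == y2.shape == new_residual.shape == x.shape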
sglang/srt/managers/io_struct.py
CHANGED
@@ -55,6 +55,7 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
+
         if (
             isinstance(self.sampling_params, dict)
             and self.sampling_params.get("n", 1) != 1
@@ -161,10 +162,10 @@ class TokenizedGenerateReqInput:
     input_ids: List[int]
     # The pixel values for input images
     pixel_values: List[float]
-    # The hash of input images
-    image_hash: int
-    # The image size
-    image_size: Tuple[int, int]
+    # The hash values of input images
+    image_hashes: List[int]
+    # The image sizes
+    image_sizes: List[List[int]]
     # The sampling parameters
     sampling_params: SamplingParams
     # Whether to return the logprobs
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -121,8 +121,8 @@ class Req:
 
         # For vision input
         self.pixel_values = None
-        self.image_size = None
-        self.image_offset = None
+        self.image_sizes = None
+        self.image_offsets = None
         self.pad_value = None
 
         # Prefix info
@@ -600,12 +600,12 @@ class ScheduleBatch:
             if req.pixel_values is not None:
                 (
                     req.origin_input_ids,
-                    req.image_offset,
+                    req.image_offsets,
                 ) = model_runner.model.pad_input_ids(
                     req.origin_input_ids_unpadded,
                     req.pad_value,
-                    req.pixel_values.shape,
-                    req.image_size,
+                    req.pixel_values,
+                    req.image_sizes,
                 )
 
             jump_forward_reqs.append(req)