sglang 0.2.14.post1__py3-none-any.whl → 0.2.14.post2__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -0,0 +1,26 @@
+ """Launch the inference server for Llava-video model."""
+
+ import argparse
+
+ from sglang.srt.server import ServerArgs, launch_server
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     ServerArgs.add_cli_args(parser)
+     args = parser.parse_args()
+     server_args = ServerArgs.from_cli_args(args)
+
+     model_overide_args = {}
+     model_overide_args["mm_spatial_pool_stride"] = 2
+     model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
+     model_overide_args["num_frames"] = 16
+     model_overide_args["model_type"] = "llavavid"
+     if model_overide_args["num_frames"] == 32:
+         model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+         model_overide_args["max_sequence_length"] = 4096 * 2
+         model_overide_args["tokenizer_model_max_length"] = 4096 * 2
+         model_overide_args["model_max_length"] = 4096 * 2
+     if "34b" in args.model_path.lower():
+         model_overide_args["image_token_index"] = 64002
+
+     launch_server(server_args, model_overide_args, None)
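The new launch script above starts the SRT server with Llava-video override arguments. For orientation only, a minimal client sketch follows; it assumes the server listens on the default port 30000 and exposes the /generate HTTP endpoint taking the GenerateReqInput fields seen later in this diff (text, sampling_params). Adjust host, port, and payload to your deployment.

# Minimal client sketch (assumptions: localhost:30000 is the server address and
# /generate is the generation endpoint; both may differ in your setup).
import requests

resp = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Describe the video.",
        "sampling_params": {"max_new_tokens": 64, "temperature": 0.0},
    },
)
print(resp.json())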
@@ -119,24 +119,7 @@ def get_tokenizer(
      tokenizer_revision: Optional[str] = None,
      **kwargs,
  ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-     if tokenizer_name.endswith(".json"):
-         return TiktokenTokenizer(tokenizer_name)
-
-     if tokenizer_name.endswith(".model"):
-         return SentencePieceTokenizer(tokenizer_name)
-
      """Gets a tokenizer for the given model name via Huggingface."""
-     if is_multimodal_model(tokenizer_name):
-         processor = get_processor(
-             tokenizer_name,
-             *args,
-             trust_remote_code=trust_remote_code,
-             tokenizer_revision=tokenizer_revision,
-             **kwargs,
-         )
-         tokenizer = processor.tokenizer
-         return tokenizer
-
      if tokenizer_mode == "slow":
          if kwargs.get("use_fast", False):
              raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
@@ -199,135 +182,3 @@ def get_processor(
          **kwargs,
      )
      return processor
-
-
- class TiktokenTokenizer:
-     def __init__(self, tokenizer_path):
-         import tiktoken
-         from jinja2 import Template
-
-         PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-
-         # Read JSON
-         name = "tmp-json"
-         with open(tokenizer_path, "rb") as fin:
-             tok_dict = json.load(fin)
-
-         mergeable_ranks = {
-             bytes(item["bytes"]): item["token"] for item in tok_dict["regular_tokens"]
-         }
-         special_tokens = {
-             bytes(item["bytes"]).decode(): item["token"]
-             for item in tok_dict["special_tokens"]
-         }
-         assert tok_dict["word_split"] == "V1"
-
-         default_allowed_special = None
-
-         kwargs = {
-             "name": name,
-             "pat_str": tok_dict.get("pat_str", PAT_STR_B),
-             "mergeable_ranks": mergeable_ranks,
-             "special_tokens": special_tokens,
-         }
-         if "default_allowed_special" in tok_dict:
-             default_allowed_special = set(
-                 [
-                     bytes(bytes_list).decode()
-                     for bytes_list in tok_dict["default_allowed_special"]
-                 ]
-             )
-         if "vocab_size" in tok_dict:
-             kwargs["explicit_n_vocab"] = tok_dict["vocab_size"]
-
-         PAD = "<|pad|>"
-         EOS = "<|eos|>"
-         SEP = "<|separator|>"
-
-         DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP}
-
-         tokenizer = tiktoken.Encoding(**kwargs)
-         tokenizer._default_allowed_special = default_allowed_special or set()
-         tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS
-
-         def encode_patched(
-             self,
-             text: str,
-             *,
-             allowed_special: Union[
-                 Literal["all"], AbstractSet[str]
-             ] = set(),  # noqa: B006
-             disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-         ) -> List[int]:
-             if isinstance(allowed_special, set):
-                 allowed_special |= self._default_allowed_special
-             return tiktoken.Encoding.encode(
-                 self,
-                 text,
-                 allowed_special=allowed_special,
-                 disallowed_special=(),
-             )
-
-         tokenizer.encode = functools.partial(encode_patched, tokenizer)
-
-         # Convert to HF interface
-         self.tokenizer = tokenizer
-         self.eos_token_id = tokenizer._special_tokens[EOS]
-         self.vocab_size = tokenizer.n_vocab
-         self.chat_template = Template(
-             "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-         )
-
-     def encode(self, x, add_special_tokens=False):
-         return self.tokenizer.encode(x)
-
-     def decode(self, x):
-         return self.tokenizer.decode(x)
-
-     def batch_decode(
-         self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-     ):
-         if isinstance(batch[0], int):
-             batch = [[x] for x in batch]
-         return self.tokenizer.decode_batch(batch)
-
-     def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-         ret = self.chat_template.render(
-             messages=messages, add_generation_prompt=add_generation_prompt
-         )
-         return self.encode(ret) if tokenize else ret
-
-
- class SentencePieceTokenizer:
-     def __init__(self, tokenizer_path):
-         import sentencepiece as spm
-         from jinja2 import Template
-
-         tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
-
-         # Convert to HF interface
-         self.tokenizer = tokenizer
-         self.eos_token_id = tokenizer.eos_id()
-         self.vocab_size = tokenizer.vocab_size()
-         self.chat_template = Template(
-             "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-         )
-
-     def encode(self, x, add_special_tokens=False):
-         return self.tokenizer.encode(x)
-
-     def decode(self, x):
-         return self.tokenizer.decode(x)
-
-     def batch_decode(
-         self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-     ):
-         if isinstance(batch[0], int):
-             batch = [[x] for x in batch]
-         return self.tokenizer.decode(batch)
-
-     def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-         ret = self.chat_template.render(
-             messages=messages, add_generation_prompt=add_generation_prompt
-         )
-         return self.encode(ret) if tokenize else ret
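The two hunks above drop the extension-based dispatch (".json" to tiktoken, ".model" to SentencePiece) and the multimodal-processor shortcut from get_tokenizer, and delete the TiktokenTokenizer and SentencePieceTokenizer wrappers, so tokenizer loading now always follows the Hugging Face path. As a rough illustration only (the model name below is a placeholder, not taken from this diff), that path boils down to:

# Illustrative sketch of the remaining Hugging Face loading path.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # placeholder model name
    trust_remote_code=False,
)
print(tokenizer.encode("Hello, world!"))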
@@ -18,7 +18,7 @@ from typing import Optional
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
- from flashinfer.activation import gelu_tanh_and_mul, silu_and_mul
+ from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
  from vllm.distributed import (
      divide,
      get_tensor_model_parallel_rank,
@@ -43,18 +43,24 @@ class SiluAndMul(CustomOp):


  class GeluAndMul(CustomOp):
-     def __init__(self, **kwargs):
+     def __init__(self, approximate="tanh"):
          super().__init__()
+         self.approximate = approximate

      def forward_native(self, x: torch.Tensor) -> torch.Tensor:
          d = x.shape[-1] // 2
-         return F.gelu(x[..., :d], approximate="tanh") * x[..., d:]
+         return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]

      def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
          d = x.shape[-1] // 2
          output_shape = x.shape[:-1] + (d,)
          out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-         gelu_tanh_and_mul(x, out)
+         if self.approximate == "tanh":
+             gelu_tanh_and_mul(x, out)
+         elif self.approximate == "none":
+             gelu_and_mul(x, out)
+         else:
+             raise RuntimeError("GeluAndMul only support tanh or none")
          return out


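The change above lets GeluAndMul select between the tanh-approximate and exact GELU kernels. A small self-contained sketch of the gated-GELU semantics both code paths implement (reference math only, no flashinfer dependency):

import torch
import torch.nn.functional as F

def gelu_and_mul_reference(x: torch.Tensor, approximate: str = "tanh") -> torch.Tensor:
    # Split the last dimension in half, apply GELU to the first half,
    # and gate it with the second half (the semantics of the fused kernels).
    d = x.shape[-1] // 2
    return F.gelu(x[..., :d], approximate=approximate) * x[..., d:]

x = torch.randn(2, 8)
out_tanh = gelu_and_mul_reference(x, approximate="tanh")   # gelu_tanh_and_mul path
out_exact = gelu_and_mul_reference(x, approximate="none")  # new gelu_and_mul path
print(out_tanh.shape, out_exact.shape)  # torch.Size([2, 4]) torch.Size([2, 4])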
@@ -19,7 +19,12 @@ from typing import Optional, Tuple, Union

  import torch
  import torch.nn as nn
- from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+ from flashinfer.norm import (
+     fused_add_rmsnorm,
+     gemma_fused_add_rmsnorm,
+     gemma_rmsnorm,
+     rmsnorm,
+ )
  from vllm.model_executor.custom_op import CustomOp


@@ -63,3 +68,44 @@ class RMSNorm(CustomOp):
              return x
          else:
              return x, residual
+
+
+ class GemmaRMSNorm(CustomOp):
+     def __init__(
+         self,
+         hidden_size: int,
+         eps: float = 1e-6,
+     ) -> None:
+         super().__init__()
+         self.weight = nn.Parameter(torch.zeros(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward_native(
+         self,
+         x: torch.Tensor,
+         residual: Optional[torch.Tensor] = None,
+     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+         orig_dtype = x.dtype
+         if residual is not None:
+             x = x + residual
+             residual = x
+
+         x = x.float()
+         variance = x.pow(2).mean(dim=-1, keepdim=True)
+         x = x * torch.rsqrt(variance + self.variance_epsilon)
+         x = x * (1.0 + self.weight.float())
+         x = x.to(orig_dtype)
+         return x if residual is None else (x, residual)
+
+     def forward_cuda(
+         self,
+         x: torch.Tensor,
+         residual: Optional[torch.Tensor] = None,
+     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+         if residual is not None:
+             gemma_fused_add_rmsnorm(
+                 x, residual, self.weight.data, self.variance_epsilon
+             )
+             return x, residual
+         out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
+         return out
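The new GemmaRMSNorm stores a zero-initialized weight and scales by (1 + weight), following Gemma's convention. A self-contained reference of the forward_native math (no flashinfer required), useful for sanity-checking the fused gemma_rmsnorm path:

import torch

def gemma_rmsnorm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize in float32, scale by (1 + weight), then cast back,
    # mirroring GemmaRMSNorm.forward_native above.
    orig_dtype = x.dtype
    xf = x.float()
    variance = xf.pow(2).mean(dim=-1, keepdim=True)
    xf = xf * torch.rsqrt(variance + eps)
    xf = xf * (1.0 + weight.float())
    return xf.to(orig_dtype)

x = torch.randn(2, 8, dtype=torch.float16)
weight = torch.zeros(8)  # GemmaRMSNorm initializes its weight to zeros
print(gemma_rmsnorm_reference(x, weight).shape)  # torch.Size([2, 8])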
@@ -55,6 +55,7 @@ class GenerateReqInput:
              self.text is not None and self.input_ids is not None
          ):
              raise ValueError("Either text or input_ids should be provided.")
+
          if (
              isinstance(self.sampling_params, dict)
              and self.sampling_params.get("n", 1) != 1
@@ -161,10 +162,10 @@ class TokenizedGenerateReqInput:
      input_ids: List[int]
      # The pixel values for input images
      pixel_values: List[float]
-     # The hash of input images
-     image_hash: int
-     # The image size
-     image_size: List[int]
+     # The hash values of input images
+     image_hashes: List[int]
+     # The image sizes
+     image_sizes: List[List[int]]
      # The sampling parameters
      sampling_params: SamplingParams
      # Whether to return the logprobs
@@ -121,8 +121,8 @@ class Req:

          # For vision input
          self.pixel_values = None
-         self.image_size = None
-         self.image_offset = None
+         self.image_sizes = None
+         self.image_offsets = None
          self.pad_value = None

          # Prefix info
@@ -600,12 +600,12 @@ class ScheduleBatch:
                      if req.pixel_values is not None:
                          (
                              req.origin_input_ids,
-                             req.image_offset,
+                             req.image_offsets,
                          ) = model_runner.model.pad_input_ids(
                              req.origin_input_ids_unpadded,
                              req.pad_value,
-                             req.pixel_values.shape,
-                             req.image_size,
+                             req.pixel_values,
+                             req.image_sizes,
                          )

                      jump_forward_reqs.append(req)
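With the rename to image_offsets/image_sizes, pad_input_ids now receives the raw pixel_values and a list of image sizes rather than a single tensor shape and size, and returns per-image offsets. A toy, self-contained sketch of that calling convention (the padding scheme below is hypothetical, not the actual model code):

from typing import List, Tuple

def pad_input_ids_sketch(
    input_ids: List[int],
    pad_value: List[int],
    pixel_values: List[object],
    image_sizes: List[List[int]],
) -> Tuple[List[int], List[int]]:
    # Hypothetical scheme: prepend one block of pad tokens per image
    # and record where each block starts.
    padded: List[int] = []
    image_offsets: List[int] = []
    for _ in pixel_values:
        image_offsets.append(len(padded))
        padded.extend(pad_value)
    padded.extend(input_ids)
    return padded, image_offsets

ids, offsets = pad_input_ids_sketch(
    [1, 2, 3], [0, 0], ["img0", "img1"], [[336, 336], [336, 336]]
)
print(ids, offsets)  # [0, 0, 0, 0, 1, 2, 3] [0, 2]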