sglang 0.4.3__py3-none-any.whl → 0.4.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1003 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
3
+ # All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """Qwen2VL model configuration"""
17
+ from typing import Dict, Iterable, List, Optional, Union
18
+
19
+ import numpy as np
20
+ from transformers import (
21
+ AutoImageProcessor,
22
+ AutoProcessor,
23
+ BaseImageProcessor,
24
+ BatchFeature,
25
+ PretrainedConfig,
26
+ ProcessorMixin,
27
+ TensorType,
28
+ )
29
+ from transformers.image_transforms import (
30
+ convert_to_rgb,
31
+ normalize,
32
+ rescale,
33
+ resize,
34
+ to_channel_dimension_format,
35
+ )
36
+ from transformers.image_utils import (
37
+ ChannelDimension,
38
+ ImageInput,
39
+ PILImageResampling,
40
+ VideoInput,
41
+ get_image_size,
42
+ infer_channel_dimension_format,
43
+ is_pil_image,
44
+ is_valid_image,
45
+ make_list_of_images,
46
+ to_numpy_array,
47
+ valid_images,
48
+ validate_preprocess_arguments,
49
+ )
50
+ from transformers.modeling_rope_utils import rope_config_validation
51
+ from transformers.models.mllama.image_processing_mllama import is_valid_list_of_images
52
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
53
+ from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs
54
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
55
+ from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
56
+
57
+
58
+ class Qwen2_5_VLVisionConfig(PretrainedConfig):
59
+ model_type = "qwen2_5_vl"
60
+ base_config_key = "vision_config"
61
+
62
+ def __init__(
63
+ self,
64
+ depth=32,
65
+ hidden_size=3584,
66
+ hidden_act="silu",
67
+ intermediate_size=3420,
68
+ num_heads=16,
69
+ in_channels=3,
70
+ patch_size=14,
71
+ spatial_merge_size=2,
72
+ temporal_patch_size=2,
73
+ tokens_per_second=4,
74
+ window_size=112,
75
+ out_hidden_size=3584,
76
+ fullatt_block_indexes=[7, 15, 23, 31],
77
+ **kwargs,
78
+ ):
79
+ super().__init__(**kwargs)
80
+
81
+ self.depth = depth
82
+ self.hidden_size = hidden_size
83
+ self.hidden_act = hidden_act
84
+ self.intermediate_size = intermediate_size
85
+ self.num_heads = num_heads
86
+ self.in_channels = in_channels
87
+ self.patch_size = patch_size
88
+ self.spatial_merge_size = spatial_merge_size
89
+ self.temporal_patch_size = temporal_patch_size
90
+ self.tokens_per_second = tokens_per_second
91
+ self.window_size = window_size
92
+ self.fullatt_block_indexes = fullatt_block_indexes
93
+ self.out_hidden_size = out_hidden_size
94
+
95
+
96
+ class Qwen2_5_VLConfig(PretrainedConfig):
97
+ r"""
98
+ This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
99
+ Qwen2.5-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
100
+ with the defaults will yield a similar configuration to that of
101
+ Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
102
+
103
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
104
+ documentation from [`PretrainedConfig`] for more information.
105
+
106
+
107
+ Args:
108
+ vocab_size (`int`, *optional*, defaults to 152064):
109
+ Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
110
+ `input_ids` passed when calling [`Qwen2_5_VLModel`]
111
+ hidden_size (`int`, *optional*, defaults to 8192):
112
+ Dimension of the hidden representations.
113
+ intermediate_size (`int`, *optional*, defaults to 29568):
114
+ Dimension of the MLP representations.
115
+ num_hidden_layers (`int`, *optional*, defaults to 80):
116
+ Number of hidden layers in the Transformer encoder.
117
+ num_attention_heads (`int`, *optional*, defaults to 64):
118
+ Number of attention heads for each attention layer in the Transformer encoder.
119
+ num_key_value_heads (`int`, *optional*, defaults to 8):
120
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
121
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
122
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
123
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
124
+ by meanpooling all the original heads within that group. For more details, check out [this
125
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to `8`.
126
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
127
+ The non-linear activation function (function or string) in the decoder.
128
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
129
+ The maximum sequence length that this model might ever be used with.
130
+ initializer_range (`float`, *optional*, defaults to 0.02):
131
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
132
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
133
+ The epsilon used by the rms normalization layers.
134
+ use_cache (`bool`, *optional*, defaults to `True`):
135
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
136
+ relevant if `config.is_decoder=True`.
137
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
138
+ Whether the model's input and output word embeddings should be tied.
139
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
140
+ The base period of the RoPE embeddings.
141
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
142
+ Whether to use sliding window attention.
143
+ sliding_window (`int`, *optional*, defaults to 4096):
144
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
145
+ max_window_layers (`int`, *optional*, defaults to 80):
146
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
147
+ attention_dropout (`float`, *optional*, defaults to 0.0):
148
+ The dropout ratio for the attention probabilities.
149
+ vision_config (`Dict`, *optional*):
150
+ The config for the visual encoder initialization.
151
+ rope_scaling (`Dict`, *optional*):
152
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
153
+ and expect the model to work with a longer `max_position_embeddings`, we recommend updating this value
154
+ accordingly.
155
+ Expected contents:
156
+ `rope_type` (`str`):
157
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
158
+ 'llama3'], with 'default' being the original RoPE implementation.
159
+ `factor` (`float`, *optional*):
160
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
161
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
162
+ original maximum pre-trained length.
163
+ `original_max_position_embeddings` (`int`, *optional*):
164
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
165
+ pretraining.
166
+ `attention_factor` (`float`, *optional*):
167
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
168
+ computation. If unspecified, it defaults to the value recommended by the implementation, using the
169
+ `factor` field to infer the suggested value.
170
+ `beta_fast` (`float`, *optional*):
171
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
172
+ ramp function. If unspecified, it defaults to 32.
173
+ `beta_slow` (`float`, *optional*):
174
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
175
+ ramp function. If unspecified, it defaults to 1.
176
+ `short_factor` (`List[float]`, *optional*):
177
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
178
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
179
+ size divided by the number of attention heads divided by 2
180
+ `long_factor` (`List[float]`, *optional*):
181
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
182
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
183
+ size divided by the number of attention heads divided by 2
184
+ `low_freq_factor` (`float`, *optional*):
185
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
186
+ `high_freq_factor` (`float`, *optional*):
187
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
188
+
189
+ ```python
190
+ >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
191
+
192
+ >>> # Initializing a Qwen2_5_VL style configuration
193
+ >>> configuration = Qwen2_5_VLConfig()
194
+
195
+ >>> # Initializing a model from the Qwen2_5_VL style configuration
196
+ >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
197
+
198
+ >>> # Accessing the model configuration
199
+ >>> configuration = model.config
200
+ ```"""
201
+
202
+ model_type = "qwen2_5_vl"
203
+ sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
204
+ keys_to_ignore_at_inference = ["past_key_values"]
205
+ # Default tensor parallel plan for base model `Qwen2_5_VL`
206
+ base_model_tp_plan = {
207
+ "layers.*.self_attn.q_proj": "colwise",
208
+ "layers.*.self_attn.k_proj": "colwise",
209
+ "layers.*.self_attn.v_proj": "colwise",
210
+ "layers.*.self_attn.o_proj": "rowwise",
211
+ "layers.*.mlp.gate_proj": "colwise",
212
+ "layers.*.mlp.up_proj": "colwise",
213
+ "layers.*.mlp.down_proj": "rowwise",
214
+ }
215
+
216
+ def __init__(
217
+ self,
218
+ vocab_size=152064,
219
+ hidden_size=8192,
220
+ intermediate_size=29568,
221
+ num_hidden_layers=80,
222
+ num_attention_heads=64,
223
+ num_key_value_heads=8,
224
+ hidden_act="silu",
225
+ max_position_embeddings=32768,
226
+ initializer_range=0.02,
227
+ rms_norm_eps=1e-05,
228
+ use_cache=True,
229
+ tie_word_embeddings=False,
230
+ rope_theta=1000000.0,
231
+ use_sliding_window=False,
232
+ sliding_window=4096,
233
+ max_window_layers=80,
234
+ attention_dropout=0.0,
235
+ vision_config=None,
236
+ rope_scaling=None,
237
+ **kwargs,
238
+ ):
239
+ if isinstance(vision_config, dict):
240
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
241
+ elif vision_config is None:
242
+ self.vision_config = self.sub_configs["vision_config"]()
243
+
244
+ self.vocab_size = vocab_size
245
+ self.max_position_embeddings = max_position_embeddings
246
+ self.hidden_size = hidden_size
247
+ self.intermediate_size = intermediate_size
248
+ self.num_hidden_layers = num_hidden_layers
249
+ self.num_attention_heads = num_attention_heads
250
+ self.use_sliding_window = use_sliding_window
251
+ self.sliding_window = sliding_window
252
+ self.max_window_layers = max_window_layers
253
+
254
+ # for backward compatibility
255
+ if num_key_value_heads is None:
256
+ num_key_value_heads = num_attention_heads
257
+
258
+ self.num_key_value_heads = num_key_value_heads
259
+ self.hidden_act = hidden_act
260
+ self.initializer_range = initializer_range
261
+ self.rms_norm_eps = rms_norm_eps
262
+ self.use_cache = use_cache
263
+ self.rope_theta = rope_theta
264
+ self.attention_dropout = attention_dropout
265
+ self.rope_scaling = rope_scaling
266
+
267
+ # Validate the correctness of rotary position embeddings parameters
268
+ # BC: if there is a 'type' field, move it to 'rope_type'.
269
+ # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
270
+ # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
271
+ # TODO: @raushan update config in the hub
272
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
273
+ if self.rope_scaling["type"] == "mrope":
274
+ self.rope_scaling["type"] = "default"
275
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
276
+ rope_config_validation(self, ignore_keys={"mrope_section"})
277
+
278
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
279
+
280
+
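
For readers skimming the diff: the `rope_scaling` backward-compatibility branch above rewrites a legacy `mrope` entry before validation. A minimal sketch of that behavior, assuming the classes in this file are importable; the `mrope_section` values are illustrative placeholders, not taken from this diff:

```python
# Legacy-style rope_scaling dict with the old "type" key (values are placeholders).
legacy = {"type": "mrope", "mrope_section": [16, 24, 24]}

config = Qwen2_5_VLConfig(rope_scaling=dict(legacy))

# "mrope" is treated as plain RoPE for validation: "type" is rewritten to "default"
# and mirrored into "rope_type", while "mrope_section" is kept and explicitly ignored
# by rope_config_validation(..., ignore_keys={"mrope_section"}).
print(config.rope_scaling["type"], config.rope_scaling["rope_type"])  # default default
print(config.rope_scaling["mrope_section"])                           # [16, 24, 24]
```
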
281
+ # FIXME: workaround for obsolete transformers versions
282
+
283
+
284
+ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
285
+ fps: Union[List[float], float]
286
+
287
+
288
+ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
289
+ videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
290
+ _defaults = {
291
+ "text_kwargs": {
292
+ "padding": False,
293
+ },
294
+ "videos_kwargs": {"fps": 2.0},
295
+ }
296
+
297
+
298
+ class Qwen2_5_VLProcessor(ProcessorMixin):
299
+ r"""
300
+ Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
301
+ [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
302
+ [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
303
+ Args:
304
+ image_processor ([`Qwen2VLImageProcessor`], *optional*):
305
+ The image processor is a required input.
306
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
307
+ The tokenizer is a required input.
308
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
309
+ in a chat into a tokenizable string.
310
+ """
311
+
312
+ attributes = ["image_processor", "tokenizer"]
313
+ valid_kwargs = ["chat_template"]
314
+
315
+ image_processor_class = "AutoImageProcessor"
316
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
317
+
318
+ def __init__(
319
+ self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
320
+ ):
321
+ self.image_token = (
322
+ "<|image_pad|>"
323
+ if not hasattr(tokenizer, "image_token")
324
+ else tokenizer.image_token
325
+ )
326
+ self.video_token = (
327
+ "<|video_pad|>"
328
+ if not hasattr(tokenizer, "video_token")
329
+ else tokenizer.video_token
330
+ )
331
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
332
+
333
+ def __call__(
334
+ self,
335
+ images: ImageInput = None,
336
+ text: Union[
337
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
338
+ ] = None,
339
+ videos: VideoInput = None,
340
+ **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
341
+ ) -> BatchFeature:
342
+ """
343
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
344
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
345
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
346
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
347
+
348
+ Args:
349
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
350
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
351
+ tensor. Both channels-first and channels-last formats are supported.
352
+ text (`str`, `List[str]`, `List[List[str]]`):
353
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
354
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
355
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
356
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
357
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
358
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
359
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
360
+ If set, will return tensors of a particular framework. Acceptable values are:
361
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
362
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
363
+ - `'np'`: Return NumPy `np.ndarray` objects.
364
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
365
+
366
+ Returns:
367
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
368
+
369
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
370
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
371
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
372
+ `None`).
373
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
374
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
375
+ - **image_grid_thw** -- List of 3D grids (temporal, height, width) of the images fed to the LLM. Returned when `images` is not `None`.
376
+ - **video_grid_thw** -- List of 3D grids (temporal, height, width) of the videos fed to the LLM. Returned when `videos` is not `None`.
377
+ - **second_per_grid_ts** -- List of seconds covered by each temporal grid step of a video. Returned when `videos` is not `None`.
378
+ """
379
+ output_kwargs = self._merge_kwargs(
380
+ Qwen2_5_VLProcessorKwargs,
381
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
382
+ **kwargs,
383
+ )
384
+ if images is not None:
385
+ image_inputs = self.image_processor(
386
+ images=images, videos=None, **output_kwargs["images_kwargs"]
387
+ )
388
+ image_grid_thw = image_inputs["image_grid_thw"]
389
+ else:
390
+ image_inputs = {}
391
+ image_grid_thw = None
392
+
393
+ if videos is not None:
394
+ videos_inputs = self.image_processor(
395
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
396
+ )
397
+ video_grid_thw = videos_inputs["video_grid_thw"]
398
+
399
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
400
+ if isinstance(fps, (int, float)):
401
+ second_per_grid_ts = [
402
+ self.image_processor.temporal_patch_size / fps
403
+ ] * len(video_grid_thw)
404
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
405
+ second_per_grid_ts = [
406
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
407
+ ]
408
+ else:
409
+ raise ValueError(
410
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
411
+ )
412
+ videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
413
+
414
+ else:
415
+ videos_inputs = {}
416
+ video_grid_thw = None
417
+
418
+ if not isinstance(text, list):
419
+ text = [text]
420
+
421
+ if image_grid_thw is not None:
422
+ merge_length = self.image_processor.merge_size**2
423
+ index = 0
424
+ for i in range(len(text)):
425
+ while self.image_token in text[i]:
426
+ text[i] = text[i].replace(
427
+ self.image_token,
428
+ "<|placeholder|>"
429
+ * (image_grid_thw[index].prod() // merge_length),
430
+ 1,
431
+ )
432
+ index += 1
433
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
434
+
435
+ if video_grid_thw is not None:
436
+ merge_length = self.image_processor.merge_size**2
437
+ index = 0
438
+ for i in range(len(text)):
439
+ while self.video_token in text[i]:
440
+ text[i] = text[i].replace(
441
+ self.video_token,
442
+ "<|placeholder|>"
443
+ * (video_grid_thw[index].prod() // merge_length),
444
+ 1,
445
+ )
446
+ index += 1
447
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
448
+
449
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
450
+
451
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
452
+
453
+ def batch_decode(self, *args, **kwargs):
454
+ """
455
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
456
+ refer to the docstring of this method for more information.
457
+ """
458
+ return self.tokenizer.batch_decode(*args, **kwargs)
459
+
460
+ def decode(self, *args, **kwargs):
461
+ """
462
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
463
+ the docstring of this method for more information.
464
+ """
465
+ return self.tokenizer.decode(*args, **kwargs)
466
+
467
+ def post_process_image_text_to_text(self, generated_outputs):
468
+ """
469
+ Post-process the output of the model to decode the text.
470
+
471
+ Args:
472
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
473
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
474
+ or `(sequence_length,)`.
475
+
476
+ Returns:
477
+ `List[str]`: The decoded text.
478
+ """
479
+ return self.tokenizer.batch_decode(
480
+ generated_outputs,
481
+ skip_special_tokens=True,
482
+ clean_up_tokenization_spaces=False,
483
+ )
484
+
485
+ @property
486
+ def model_input_names(self):
487
+ tokenizer_input_names = self.tokenizer.model_input_names
488
+ image_processor_input_names = self.image_processor.model_input_names
489
+ names_from_processor = list(
490
+ dict.fromkeys(tokenizer_input_names + image_processor_input_names)
491
+ )
492
+ return names_from_processor + ["second_per_grid_ts"]
493
+
494
+
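
To make the placeholder expansion in `__call__` concrete, here is a small worked example under the default image-processor settings (`patch_size=14`, `merge_size=2`, `temporal_patch_size=2`); the 448×448 input size is a hypothetical choice that `smart_resize` would leave unchanged, since 448 is a multiple of 28 and the pixel count sits within the min/max bounds:

```python
# Each <|image_pad|> in the prompt is replaced by image_grid_thw.prod() // merge_size**2 tokens.
patch_size, merge_size, temporal_patch_size = 14, 2, 2

grid_t = 1                    # one image is padded to temporal_patch_size frames, then divided by it
grid_h = 448 // patch_size    # 32
grid_w = 448 // patch_size    # 32

num_patches = grid_t * grid_h * grid_w            # 1024 rows in pixel_values
num_image_tokens = num_patches // merge_size**2   # 256 <|image_pad|> tokens fed to the LLM
print(num_patches, num_image_tokens)              # 1024 256
```
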
495
+ class Qwen2_5_VLImageProcessor(BaseImageProcessor):
496
+ r"""
497
+ Constructs a Qwen2.5-VL image processor that dynamically resizes images based on their original resolution.
498
+
499
+ Args:
500
+ do_resize (`bool`, *optional*, defaults to `True`):
501
+ Whether to resize the image's (height, width) dimensions.
502
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
503
+ Resampling filter to use when resizing the image.
504
+ do_rescale (`bool`, *optional*, defaults to `True`):
505
+ Whether to rescale the image by the specified scale `rescale_factor`.
506
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
507
+ Scale factor to use if rescaling the image.
508
+ do_normalize (`bool`, *optional*, defaults to `True`):
509
+ Whether to normalize the image.
510
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
511
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
512
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
513
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
514
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
515
+ Whether to convert the image to RGB.
516
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
517
+ The minimum number of pixels allowed in the resized image.
518
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
519
+ The maximum number of pixels allowed in the resized image.
520
+ patch_size (`int`, *optional*, defaults to 14):
521
+ The spatial patch size of the vision encoder.
522
+ temporal_patch_size (`int`, *optional*, defaults to 2):
523
+ The temporal patch size of the vision encoder.
524
+ merge_size (`int`, *optional*, defaults to 2):
525
+ The spatial merge size used when mapping vision encoder patches to LLM tokens.
526
+ """
527
+
528
+ model_input_names = [
529
+ "pixel_values",
530
+ "image_grid_thw",
531
+ "pixel_values_videos",
532
+ "video_grid_thw",
533
+ "second_per_grid_ts",
534
+ ]
535
+
536
+ def __init__(
537
+ self,
538
+ do_resize: bool = True,
539
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
540
+ do_rescale: bool = True,
541
+ rescale_factor: Union[int, float] = 1 / 255,
542
+ do_normalize: bool = True,
543
+ image_mean: Optional[Union[float, List[float]]] = None,
544
+ image_std: Optional[Union[float, List[float]]] = None,
545
+ do_convert_rgb: bool = True,
546
+ min_pixels: int = 56 * 56,
547
+ max_pixels: int = 28 * 28 * 1280,
548
+ patch_size: int = 14,
549
+ temporal_patch_size: int = 2,
550
+ merge_size: int = 2,
551
+ **kwargs,
552
+ ) -> None:
553
+ super().__init__(**kwargs)
554
+ self.do_resize = do_resize
555
+ self.resample = resample
556
+ self.do_rescale = do_rescale
557
+ self.rescale_factor = rescale_factor
558
+ self.do_normalize = do_normalize
559
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
560
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
561
+ self.min_pixels = min_pixels
562
+ self.max_pixels = max_pixels
563
+ self.patch_size = patch_size
564
+ self.temporal_patch_size = temporal_patch_size
565
+ self.merge_size = merge_size
566
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
567
+ self.do_convert_rgb = do_convert_rgb
568
+
569
+ def rescale(
570
+ self,
571
+ image: np.ndarray,
572
+ scale: float,
573
+ data_format: Optional[Union[str, ChannelDimension]] = None,
574
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
575
+ **kwargs,
576
+ ) -> np.ndarray:
577
+ """
578
+ Rescale an image by a scale factor. image = image * scale.
579
+
580
+ Args:
581
+ image (`np.ndarray`):
582
+ Image to rescale.
583
+ scale (`float`):
584
+ The scaling factor to rescale pixel values by.
585
+ data_format (`str` or `ChannelDimension`, *optional*):
586
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
587
+ image is used. Can be one of:
588
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
589
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
590
+ input_data_format (`ChannelDimension` or `str`, *optional*):
591
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
592
+ from the input image. Can be one of:
593
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
594
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
595
+
596
+ Returns:
597
+ `np.ndarray`: The rescaled image.
598
+ """
599
+ return rescale(
600
+ image,
601
+ scale=scale,
602
+ data_format=data_format,
603
+ input_data_format=input_data_format,
604
+ **kwargs,
605
+ )
606
+
607
+ def normalize(
608
+ self,
609
+ image: np.ndarray,
610
+ mean: Union[float, Iterable[float]],
611
+ std: Union[float, Iterable[float]],
612
+ data_format: Optional[Union[str, ChannelDimension]] = None,
613
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
614
+ **kwargs,
615
+ ) -> np.ndarray:
616
+ """
617
+ Normalize an image. image = (image - image_mean) / image_std.
618
+
619
+ Args:
620
+ image (`np.ndarray`):
621
+ Image to normalize.
622
+ mean (`float` or `Iterable[float]`):
623
+ Image mean to use for normalization.
624
+ std (`float` or `Iterable[float]`):
625
+ Image standard deviation to use for normalization.
626
+ data_format (`str` or `ChannelDimension`, *optional*):
627
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
628
+ image is used. Can be one of:
629
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
630
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
631
+ input_data_format (`ChannelDimension` or `str`, *optional*):
632
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
633
+ from the input image. Can be one of:
634
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
635
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
636
+
637
+ Returns:
638
+ `np.ndarray`: The normalized image.
639
+ """
640
+ return normalize(
641
+ image,
642
+ mean=mean,
643
+ std=std,
644
+ data_format=data_format,
645
+ input_data_format=input_data_format,
646
+ **kwargs,
647
+ )
648
+
649
+ def _preprocess(
650
+ self,
651
+ images: Union[ImageInput, VideoInput],
652
+ do_resize: bool = None,
653
+ resample: PILImageResampling = None,
654
+ do_rescale: bool = None,
655
+ rescale_factor: float = None,
656
+ do_normalize: bool = None,
657
+ image_mean: Optional[Union[float, List[float]]] = None,
658
+ image_std: Optional[Union[float, List[float]]] = None,
659
+ do_convert_rgb: bool = None,
660
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
661
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
662
+ ):
663
+ """
664
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
665
+
666
+ Args:
667
+ images (`ImageInput`):
668
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
669
+ vision_info (`List[Dict]`, *optional*):
670
+ Optional list of dictionaries containing additional information about vision inputs.
671
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
672
+ Whether to resize the image.
673
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
674
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
675
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
676
+ Whether to rescale the image.
677
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
678
+ Scale factor to use if rescaling the image.
679
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
680
+ Whether to normalize the image.
681
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
682
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
683
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
684
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
685
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
686
+ Whether to convert the image to RGB.
687
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
688
+ The channel dimension format for the output image. Can be one of:
689
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
690
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
691
+ - Unset: Use the channel dimension format of the input image.
692
+ input_data_format (`ChannelDimension` or `str`, *optional*):
693
+ The channel dimension format for the input image. Can be one of:
694
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
695
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
696
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
697
+ """
698
+ images = make_list_of_images(images)
699
+
700
+ if do_convert_rgb:
701
+ images = [convert_to_rgb(image) for image in images]
702
+
703
+ # All transformations expect numpy arrays.
704
+ images = [to_numpy_array(image) for image in images]
705
+
706
+ if input_data_format is None:
707
+ # We assume that all images have the same channel dimension format.
708
+ input_data_format = infer_channel_dimension_format(images[0])
709
+
710
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
711
+ resized_height, resized_width = height, width
712
+ processed_images = []
713
+ for image in images:
714
+ if do_resize:
715
+ resized_height, resized_width = smart_resize(
716
+ height,
717
+ width,
718
+ factor=self.patch_size * self.merge_size,
719
+ min_pixels=self.min_pixels,
720
+ max_pixels=self.max_pixels,
721
+ )
722
+ image = resize(
723
+ image,
724
+ size=(resized_height, resized_width),
725
+ resample=resample,
726
+ input_data_format=input_data_format,
727
+ )
728
+
729
+ if do_rescale:
730
+ image = self.rescale(
731
+ image, scale=rescale_factor, input_data_format=input_data_format
732
+ )
733
+
734
+ if do_normalize:
735
+ image = self.normalize(
736
+ image=image,
737
+ mean=image_mean,
738
+ std=image_std,
739
+ input_data_format=input_data_format,
740
+ )
741
+
742
+ image = to_channel_dimension_format(
743
+ image, data_format, input_channel_dim=input_data_format
744
+ )
745
+ processed_images.append(image)
746
+
747
+ patches = np.array(processed_images)
748
+ if data_format == ChannelDimension.LAST:
749
+ patches = patches.transpose(0, 3, 1, 2)
750
+ if patches.shape[0] % self.temporal_patch_size != 0:
751
+ repeats = np.repeat(
752
+ patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0
753
+ )
754
+ patches = np.concatenate([patches, repeats], axis=0)
755
+ channel = patches.shape[1]
756
+ grid_t = patches.shape[0] // self.temporal_patch_size
757
+ grid_h, grid_w = (
758
+ resized_height // self.patch_size,
759
+ resized_width // self.patch_size,
760
+ )
761
+ patches = patches.reshape(
762
+ grid_t,
763
+ self.temporal_patch_size,
764
+ channel,
765
+ grid_h // self.merge_size,
766
+ self.merge_size,
767
+ self.patch_size,
768
+ grid_w // self.merge_size,
769
+ self.merge_size,
770
+ self.patch_size,
771
+ )
772
+ patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
773
+ flatten_patches = patches.reshape(
774
+ grid_t * grid_h * grid_w,
775
+ channel * self.temporal_patch_size * self.patch_size * self.patch_size,
776
+ )
777
+
778
+ return flatten_patches, (grid_t, grid_h, grid_w)
779
+
780
+ def preprocess(
781
+ self,
782
+ images: ImageInput,
783
+ videos: VideoInput = None,
784
+ do_resize: bool = None,
785
+ size: Dict[str, int] = None,
786
+ resample: PILImageResampling = None,
787
+ do_rescale: bool = None,
788
+ rescale_factor: float = None,
789
+ do_normalize: bool = None,
790
+ image_mean: Optional[Union[float, List[float]]] = None,
791
+ image_std: Optional[Union[float, List[float]]] = None,
792
+ do_convert_rgb: bool = None,
793
+ return_tensors: Optional[Union[str, TensorType]] = None,
794
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
795
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
796
+ ):
797
+ """
798
+ Args:
799
+ images (`ImageInput`):
800
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
801
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
802
+ videos (`VideoInput`):
803
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
804
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
805
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
806
+ Whether to resize the image.
807
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
808
+ Size constraints of the image after resizing, given as a dictionary with `"min_pixels"` and `"max_pixels"`
809
+ keys bounding the total number of pixels; the input aspect ratio is preserved.
810
+ resample (`int`, *optional*, defaults to `self.resample`):
811
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
812
+ has an effect if `do_resize` is set to `True`.
813
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
814
+ Whether to rescale the image.
815
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
816
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
817
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
818
+ Whether to normalize the image.
819
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
820
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
821
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
822
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
823
+ `True`.
824
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
825
+ Whether to convert the image to RGB.
826
+ return_tensors (`str` or `TensorType`, *optional*):
827
+ The type of tensors to return. Can be one of:
828
+ - Unset: Return a list of `np.ndarray`.
829
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
830
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
831
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
832
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
833
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
834
+ The channel dimension format for the output image. Can be one of:
835
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
836
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
837
+ - Unset: Use the channel dimension format of the input image.
838
+ input_data_format (`ChannelDimension` or `str`, *optional*):
839
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
840
+ from the input image. Can be one of:
841
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
842
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
843
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
844
+
845
+ """
846
+ do_resize = do_resize if do_resize is not None else self.do_resize
847
+ size = size if size is not None else self.size
848
+ resample = resample if resample is not None else self.resample
849
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
850
+ rescale_factor = (
851
+ rescale_factor if rescale_factor is not None else self.rescale_factor
852
+ )
853
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
854
+ image_mean = image_mean if image_mean is not None else self.image_mean
855
+ image_std = image_std if image_std is not None else self.image_std
856
+ do_convert_rgb = (
857
+ do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
858
+ )
859
+
860
+ def make_flat_list_of_images(
861
+ images: Union[List[ImageInput], ImageInput],
862
+ ) -> ImageInput:
863
+ """
864
+ Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
865
+ If the input is a nested list of images, it is converted to a flat list of images.
866
+ Args:
867
+ images (`Union[List[ImageInput], ImageInput]`):
868
+ The input image.
869
+ Returns:
870
+ list: A list of images or a 4d array of images.
871
+ """
872
+ # If the input is a nested list of images, we flatten it
873
+ if (
874
+ isinstance(images, (list, tuple))
875
+ and all(isinstance(images_i, (list, tuple)) for images_i in images)
876
+ and all(is_valid_list_of_images(images_i) for images_i in images)
877
+ ):
878
+ return [img for img_list in images for img in img_list]
879
+
880
+ if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
881
+ if is_pil_image(images[0]) or images[0].ndim == 3:
882
+ return images
883
+ if images[0].ndim == 4:
884
+ return [img for img_list in images for img in img_list]
885
+
886
+ if is_valid_image(images):
887
+ if is_pil_image(images) or images.ndim == 3:
888
+ return [images]
889
+ if images.ndim == 4:
890
+ return list(images)
891
+
892
+ raise ValueError(f"Could not make a flat list of images from {images}")
893
+
894
+ def make_batched_videos(videos) -> VideoInput:
895
+ """
896
+ Ensure that the input is a list of videos.
897
+ Args:
898
+ videos (`VideoInput`):
899
+ Video or videos to turn into a list of videos.
900
+ Returns:
901
+ list: A list of videos.
902
+ """
903
+ if (
904
+ isinstance(videos, (list, tuple))
905
+ and isinstance(videos[0], (list, tuple))
906
+ and is_valid_image(videos[0][0])
907
+ ):
908
+ # case 1: nested batch of videos so we flatten it
909
+ if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4:
910
+ videos = [
911
+ [video for batch_list in batched_videos for video in batch_list]
912
+ for batched_videos in videos
913
+ ]
914
+ # case 2: list of videos represented as list of video frames
915
+ return videos
916
+
917
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
918
+ if is_pil_image(videos[0]) or videos[0].ndim == 3:
919
+ return [videos]
920
+ elif videos[0].ndim == 4:
921
+ return [list(video) for video in videos]
922
+
923
+ elif is_valid_image(videos):
924
+ if is_pil_image(videos) or videos.ndim == 3:
925
+ return [[videos]]
926
+ elif videos.ndim == 4:
927
+ return [list(videos)]
928
+
929
+ raise ValueError(f"Could not make batched video from {videos}")
930
+
931
+ if images is not None:
932
+ images = make_flat_list_of_images(images)
933
+ if videos is not None:
934
+ videos = make_batched_videos(videos)
935
+
936
+ if images is not None and not valid_images(images):
937
+ raise ValueError(
938
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
939
+ "torch.Tensor, tf.Tensor or jax.ndarray."
940
+ )
941
+
942
+ validate_preprocess_arguments(
943
+ rescale_factor=rescale_factor,
944
+ do_normalize=do_normalize,
945
+ image_mean=image_mean,
946
+ image_std=image_std,
947
+ do_resize=do_resize,
948
+ size=size,
949
+ resample=resample,
950
+ )
951
+
952
+ if images is not None:
953
+ pixel_values, vision_grid_thws = [], []
954
+ for image in images:
955
+ patches, image_grid_thw = self._preprocess(
956
+ image,
957
+ do_resize=do_resize,
958
+ resample=resample,
959
+ do_rescale=do_rescale,
960
+ rescale_factor=rescale_factor,
961
+ do_normalize=do_normalize,
962
+ image_mean=image_mean,
963
+ image_std=image_std,
964
+ data_format=data_format,
965
+ do_convert_rgb=do_convert_rgb,
966
+ input_data_format=input_data_format,
967
+ )
968
+ pixel_values.extend(patches)
969
+ vision_grid_thws.append(image_grid_thw)
970
+ pixel_values = np.array(pixel_values)
971
+ vision_grid_thws = np.array(vision_grid_thws)
972
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
973
+
974
+ if videos is not None:
975
+ pixel_values, vision_grid_thws = [], []
976
+ for images in videos:
977
+ patches, video_grid_thw = self._preprocess(
978
+ images,
979
+ do_resize=do_resize,
980
+ resample=resample,
981
+ do_rescale=do_rescale,
982
+ rescale_factor=rescale_factor,
983
+ do_normalize=do_normalize,
984
+ image_mean=image_mean,
985
+ image_std=image_std,
986
+ data_format=data_format,
987
+ do_convert_rgb=do_convert_rgb,
988
+ input_data_format=input_data_format,
989
+ )
990
+ pixel_values.extend(patches)
991
+ vision_grid_thws.append(video_grid_thw)
992
+ pixel_values = np.array(pixel_values)
993
+ vision_grid_thws = np.array(vision_grid_thws)
994
+ data = {
995
+ "pixel_values_videos": pixel_values,
996
+ "video_grid_thw": vision_grid_thws,
997
+ }
998
+
999
+ return BatchFeature(data=data, tensor_type=return_tensors)
1000
+
1001
+
1002
+ AutoImageProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLImageProcessor)
1003
+ AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor)
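
Finally, a minimal usage sketch of the classes registered above. The checkpoint id and image path are illustrative placeholders, and the snippet assumes a Qwen2-style tokenizer that already defines the `<|image_pad|>` special token; the processor is built directly rather than resolved through `AutoProcessor`:

```python
from PIL import Image
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")  # placeholder checkpoint id
image_processor = Qwen2_5_VLImageProcessor()  # defaults defined in this file
processor = Qwen2_5_VLProcessor(image_processor=image_processor, tokenizer=tokenizer)

image = Image.open("example.jpg").convert("RGB")  # placeholder path
prompt = "<|vision_start|><|image_pad|><|vision_end|>Describe this image."

# __call__ resizes the image, expands <|image_pad|> to one token per merged patch,
# tokenizes the prompt, and attaches pixel_values plus image_grid_thw.
batch = processor(images=image, text=prompt, return_tensors="pt")
print(batch["input_ids"].shape, batch["pixel_values"].shape, batch["image_grid_thw"])
```
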