sglang 0.4.3__py3-none-any.whl → 0.4.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/backend/openai.py +5 -0
- sglang/lang/chat_template.py +22 -7
- sglang/lang/ir.py +1 -0
- sglang/srt/configs/__init__.py +6 -3
- sglang/srt/configs/model_config.py +2 -0
- sglang/srt/configs/qwen2_5_vl_config.py +1003 -0
- sglang/srt/entrypoints/engine.py +16 -1
- sglang/srt/hf_transformers_utils.py +2 -3
- sglang/srt/managers/image_processor.py +217 -122
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/models/deepseek_nextn.py +295 -0
- sglang/srt/models/deepseek_v2.py +4 -1
- sglang/srt/models/llava.py +2 -1
- sglang/srt/models/qwen2_5_vl.py +722 -0
- sglang/srt/models/qwen2_vl.py +2 -1
- sglang/srt/openai_api/adapter.py +17 -3
- sglang/srt/server_args.py +6 -3
- sglang/srt/speculative/eagle_worker.py +7 -2
- sglang/srt/speculative/spec_info.py +11 -1
- sglang/utils.py +99 -19
- sglang/version.py +1 -1
- {sglang-0.4.3.dist-info → sglang-0.4.3.post1.dist-info}/METADATA +2 -2
- {sglang-0.4.3.dist-info → sglang-0.4.3.post1.dist-info}/RECORD +26 -24
- sglang/srt/configs/qwen2vl.py +0 -130
- {sglang-0.4.3.dist-info → sglang-0.4.3.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.dist-info → sglang-0.4.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.3.dist-info → sglang-0.4.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/configs/qwen2_5_vl_config.py (new file)

@@ -0,0 +1,1003 @@

````python
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2VL model configuration"""
from typing import Dict, Iterable, List, Optional, Union

import numpy as np
from transformers import (
    AutoImageProcessor,
    AutoProcessor,
    BaseImageProcessor,
    BatchFeature,
    PretrainedConfig,
    ProcessorMixin,
    TensorType,
)
from transformers.image_transforms import (
    convert_to_rgb,
    normalize,
    rescale,
    resize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    VideoInput,
    get_image_size,
    infer_channel_dimension_format,
    is_pil_image,
    is_valid_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from transformers.modeling_rope_utils import rope_config_validation
from transformers.models.mllama.image_processing_mllama import is_valid_list_of_images
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD


class Qwen2_5_VLVisionConfig(PretrainedConfig):
    model_type = "qwen2_5_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=[7, 15, 23, 31],
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes
        self.out_hidden_size = out_hidden_size


class Qwen2_5_VLConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
    Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 152064):
            Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen2_5_VLModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 29568):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 80):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 80):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        vision_config (`Dict`, *optional*):
            The config for the visual encoder initialization.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE

    ```python
    >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig

    >>> # Initializing a Qwen2_5_VL style configuration
    >>> configuration = Qwen2_5_VLConfig()

    >>> # Initializing a model from the Qwen2-VL-7B style configuration
    >>> model = Qwen2_5_VLForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2_5_vl"
    sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `Qwen2_5_VL`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
        # TODO: @raushan update config in the hub
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


# FIXME: workaround of obsolete transformers version


class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
    fps: Union[List[float], float]


class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
    videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "videos_kwargs": {"fps": 2.0},
    }


class Qwen2_5_VLProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
        self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
    ):
        self.image_token = (
            "<|image_pad|>"
            if not hasattr(tokenizer, "image_token")
            else tokenizer.image_token
        )
        self.video_token = (
            "<|video_pad|>"
            if not hasattr(tokenizer, "video_token")
            else tokenizer.video_token
        )
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        videos: VideoInput = None,
        **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Qwen2_5_VLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(
                images=images, videos=None, **output_kwargs["images_kwargs"]
            )
            image_grid_thw = image_inputs["image_grid_thw"]
        else:
            image_inputs = {}
            image_grid_thw = None

        if videos is not None:
            videos_inputs = self.image_processor(
                images=None, videos=videos, **output_kwargs["images_kwargs"]
            )
            video_grid_thw = videos_inputs["video_grid_thw"]

            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
            if isinstance(fps, (int, float)):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / fps
                ] * len(video_grid_thw)
            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / tmp for tmp in fps
                ]
            else:
                raise ValueError(
                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
                )
            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})

        else:
            videos_inputs = {}
            video_grid_thw = None

        if not isinstance(text, list):
            text = [text]

        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    text[i] = text[i].replace(
                        self.image_token,
                        "<|placeholder|>"
                        * (image_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if video_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    text[i] = text[i].replace(
                        self.video_token,
                        "<|placeholder|>"
                        * (video_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.

        Returns:
            `List[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(
            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
        )
        return names_from_processor + ["second_per_grid_ts"]


class Qwen2_5_VLImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
        "second_per_grid_ts",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def rescale(
        self,
        image: np.ndarray,
        scale: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Rescale an image by a scale factor. image = image * scale.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            scale (`float`):
                The scaling factor to rescale pixel values by.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The rescaled image.
        """
        return rescale(
            image,
            scale=scale,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def normalize(
        self,
        image: np.ndarray,
        mean: Union[float, Iterable[float]],
        std: Union[float, Iterable[float]],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Normalize an image. image = (image - image_mean) / image_std.

        Args:
            image (`np.ndarray`):
                Image to normalize.
            mean (`float` or `Iterable[float]`):
                Image mean to use for normalization.
            std (`float` or `Iterable[float]`):
                Image standard deviation to use for normalization.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The normalized image.
        """
        return normalize(
            image,
            mean=mean,
            std=std,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    input_data_format=input_data_format,
                )

            if do_rescale:
                image = self.rescale(
                    image, scale=rescale_factor, input_data_format=input_data_format
                )

            if do_normalize:
                image = self.normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    input_data_format=input_data_format,
                )

            image = to_channel_dimension_format(
                image, data_format, input_channel_dim=input_data_format
            )
            processed_images.append(image)

        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0
            )
            patches = np.concatenate([patches, repeats], axis=0)
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h // self.merge_size,
            self.merge_size,
            self.patch_size,
            grid_w // self.merge_size,
            self.merge_size,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
        flatten_patches = patches.reshape(
            grid_t * grid_h * grid_w,
            channel * self.temporal_patch_size * self.patch_size * self.patch_size,
        )

        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = (
            rescale_factor if rescale_factor is not None else self.rescale_factor
        )
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = (
            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        )

        def make_flat_list_of_images(
            images: Union[List[ImageInput], ImageInput],
        ) -> ImageInput:
            """
            Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
            If the input is a nested list of images, it is converted to a flat list of images.
            Args:
                images (`Union[List[ImageInput], ImageInput]`):
                    The input image.
            Returns:
                list: A list of images or a 4d array of images.
            """
            # If the input is a nested list of images, we flatten it
            if (
                isinstance(images, (list, tuple))
                and all(isinstance(images_i, (list, tuple)) for images_i in images)
                and all(is_valid_list_of_images(images_i) for images_i in images)
            ):
                return [img for img_list in images for img in img_list]

            if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
                if is_pil_image(images[0]) or images[0].ndim == 3:
                    return images
                if images[0].ndim == 4:
                    return [img for img_list in images for img in img_list]

            if is_valid_image(images):
                if is_pil_image(images) or images.ndim == 3:
                    return [images]
                if images.ndim == 4:
                    return list(images)

            raise ValueError(f"Could not make a flat list of images from {images}")

        def make_batched_videos(videos) -> VideoInput:
            """
            Ensure that the input is a list of videos.
            Args:
                videos (`VideoInput`):
                    Video or videos to turn into a list of videos.
            Returns:
                list: A list of videos.
            """
            if (
                isinstance(videos, (list, tuple))
                and isinstance(videos[0], (list, tuple))
                and is_valid_image(videos[0][0])
            ):
                # case 1: nested batch of videos so we flatten it
                if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4:
                    videos = [
                        [video for batch_list in batched_videos for video in batch_list]
                        for batched_videos in videos
                    ]
                # case 2: list of videos represented as list of video frames
                return videos

            elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
                if is_pil_image(videos[0]) or videos[0].ndim == 3:
                    return [videos]
                elif videos[0].ndim == 4:
                    return [list(video) for video in videos]

            elif is_valid_image(videos):
                if is_pil_image(videos) or videos.ndim == 3:
                    return [[videos]]
                elif videos.ndim == 4:
                    return [list(videos)]

            raise ValueError(f"Could not make batched video from {videos}")

        if images is not None:
            images = make_flat_list_of_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {
                "pixel_values_videos": pixel_values,
                "video_grid_thw": vision_grid_thws,
            }

        return BatchFeature(data=data, tensor_type=return_tensors)


AutoImageProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLImageProcessor)
AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor)
````