sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +26 -4
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +676 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +49 -8
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  34. sglang/srt/distributed/parallel_state.py +42 -8
  35. sglang/srt/entrypoints/engine.py +55 -5
  36. sglang/srt/entrypoints/http_server.py +78 -13
  37. sglang/srt/entrypoints/verl_engine.py +2 -0
  38. sglang/srt/function_call_parser.py +133 -55
  39. sglang/srt/hf_transformers_utils.py +28 -3
  40. sglang/srt/layers/activation.py +4 -2
  41. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  42. sglang/srt/layers/attention/flashattention_backend.py +434 -0
  43. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  44. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  45. sglang/srt/layers/attention/triton_backend.py +171 -38
  46. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  47. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  48. sglang/srt/layers/attention/utils.py +53 -0
  49. sglang/srt/layers/attention/vision.py +9 -28
  50. sglang/srt/layers/dp_attention.py +41 -19
  51. sglang/srt/layers/layernorm.py +24 -2
  52. sglang/srt/layers/linear.py +17 -5
  53. sglang/srt/layers/logits_processor.py +25 -7
  54. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  55. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  56. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  57. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  64. sglang/srt/layers/moe/topk.py +60 -20
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +80 -53
  67. sglang/srt/layers/quantization/awq.py +200 -0
  68. sglang/srt/layers/quantization/base_config.py +5 -0
  69. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  70. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  72. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  75. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  76. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  77. sglang/srt/layers/quantization/fp8.py +76 -34
  78. sglang/srt/layers/quantization/fp8_kernel.py +25 -8
  79. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  80. sglang/srt/layers/quantization/gptq.py +36 -19
  81. sglang/srt/layers/quantization/kv_cache.py +98 -0
  82. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  83. sglang/srt/layers/quantization/utils.py +153 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  85. sglang/srt/layers/rotary_embedding.py +78 -87
  86. sglang/srt/layers/sampler.py +1 -1
  87. sglang/srt/lora/backend/base_backend.py +4 -4
  88. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  89. sglang/srt/lora/backend/triton_backend.py +5 -8
  90. sglang/srt/lora/layers.py +87 -33
  91. sglang/srt/lora/lora.py +2 -22
  92. sglang/srt/lora/lora_manager.py +67 -30
  93. sglang/srt/lora/mem_pool.py +117 -52
  94. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  95. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  96. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  97. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  98. sglang/srt/lora/utils.py +18 -1
  99. sglang/srt/managers/cache_controller.py +2 -5
  100. sglang/srt/managers/data_parallel_controller.py +30 -8
  101. sglang/srt/managers/expert_distribution.py +81 -0
  102. sglang/srt/managers/io_struct.py +43 -5
  103. sglang/srt/managers/mm_utils.py +373 -0
  104. sglang/srt/managers/multimodal_processor.py +68 -0
  105. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  106. sglang/srt/managers/multimodal_processors/clip.py +63 -0
  107. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  108. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  109. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  110. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  111. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  112. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  113. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  114. sglang/srt/managers/schedule_batch.py +134 -30
  115. sglang/srt/managers/scheduler.py +290 -31
  116. sglang/srt/managers/session_controller.py +1 -1
  117. sglang/srt/managers/tokenizer_manager.py +59 -24
  118. sglang/srt/managers/tp_worker.py +4 -1
  119. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  120. sglang/srt/managers/utils.py +6 -1
  121. sglang/srt/mem_cache/hiradix_cache.py +18 -7
  122. sglang/srt/mem_cache/memory_pool.py +255 -98
  123. sglang/srt/mem_cache/paged_allocator.py +2 -2
  124. sglang/srt/mem_cache/radix_cache.py +4 -4
  125. sglang/srt/model_executor/cuda_graph_runner.py +36 -21
  126. sglang/srt/model_executor/forward_batch_info.py +68 -11
  127. sglang/srt/model_executor/model_runner.py +75 -8
  128. sglang/srt/model_loader/loader.py +171 -3
  129. sglang/srt/model_loader/weight_utils.py +51 -3
  130. sglang/srt/models/clip.py +563 -0
  131. sglang/srt/models/deepseek_janus_pro.py +31 -88
  132. sglang/srt/models/deepseek_nextn.py +22 -10
  133. sglang/srt/models/deepseek_v2.py +329 -73
  134. sglang/srt/models/deepseek_vl2.py +358 -0
  135. sglang/srt/models/gemma3_causal.py +694 -0
  136. sglang/srt/models/gemma3_mm.py +468 -0
  137. sglang/srt/models/llama.py +47 -7
  138. sglang/srt/models/llama_eagle.py +1 -0
  139. sglang/srt/models/llama_eagle3.py +196 -0
  140. sglang/srt/models/llava.py +3 -3
  141. sglang/srt/models/llavavid.py +3 -3
  142. sglang/srt/models/minicpmo.py +1995 -0
  143. sglang/srt/models/minicpmv.py +62 -137
  144. sglang/srt/models/mllama.py +4 -4
  145. sglang/srt/models/phi3_small.py +1 -1
  146. sglang/srt/models/qwen2.py +3 -0
  147. sglang/srt/models/qwen2_5_vl.py +68 -146
  148. sglang/srt/models/qwen2_classification.py +75 -0
  149. sglang/srt/models/qwen2_moe.py +9 -1
  150. sglang/srt/models/qwen2_vl.py +25 -63
  151. sglang/srt/openai_api/adapter.py +201 -104
  152. sglang/srt/openai_api/protocol.py +33 -7
  153. sglang/srt/patch_torch.py +71 -0
  154. sglang/srt/sampling/sampling_batch_info.py +1 -1
  155. sglang/srt/sampling/sampling_params.py +6 -6
  156. sglang/srt/server_args.py +114 -14
  157. sglang/srt/speculative/build_eagle_tree.py +7 -347
  158. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  159. sglang/srt/speculative/eagle_utils.py +208 -252
  160. sglang/srt/speculative/eagle_worker.py +140 -54
  161. sglang/srt/speculative/spec_info.py +6 -1
  162. sglang/srt/torch_memory_saver_adapter.py +22 -0
  163. sglang/srt/utils.py +215 -21
  164. sglang/test/__init__.py +0 -0
  165. sglang/test/attention/__init__.py +0 -0
  166. sglang/test/attention/test_flashattn_backend.py +312 -0
  167. sglang/test/runners.py +29 -2
  168. sglang/test/test_activation.py +2 -1
  169. sglang/test/test_block_fp8.py +5 -4
  170. sglang/test/test_block_fp8_ep.py +2 -1
  171. sglang/test/test_dynamic_grad_mode.py +58 -0
  172. sglang/test/test_layernorm.py +3 -2
  173. sglang/test/test_utils.py +56 -5
  174. sglang/utils.py +31 -0
  175. sglang/version.py +1 -1
  176. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
  177. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
  178. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
  179. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  180. sglang/srt/managers/image_processor.py +0 -55
  181. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  182. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  183. sglang/srt/managers/multi_modality_padding.py +0 -134
  184. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
  185. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
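The only diff expanded on this page is entry 179, the deletion of `sglang/srt/configs/qwen2_5_vl_config.py` (reproduced below). Two other changes visible in the listing are easy to verify after upgrading: the version bump recorded in `sglang/version.py` and the rename of `sglang/srt/managers/image_processors/` to `sglang/srt/managers/multimodal_processors/`. The sketch below is a hedged sanity check, not part of the release; it assumes only that `sglang` exposes `__version__` (as `sglang/version.py` suggests) and uses the module paths exactly as they appear in the file list.

```python
# Hedged post-upgrade check: only standard-library calls plus the module paths
# shown in the file list above are used; nothing else about the API is assumed.
import importlib.util

import sglang

# sglang/version.py changed by +1/-1 in this release (0.4.4.post1 -> 0.4.4.post3).
print("installed version:", sglang.__version__)

# The per-model processors moved from managers/image_processors/ to
# managers/multimodal_processors/; check which layout the installed wheel has.
old_pkg = importlib.util.find_spec("sglang.srt.managers.image_processors")
new_pkg = importlib.util.find_spec("sglang.srt.managers.multimodal_processors")
print("old image_processors package present:", old_pkg is not None)
print("new multimodal_processors package present:", new_pkg is not None)
```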
@@ -1,1006 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
3
- # All rights reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- """Qwen2VL model configuration"""
17
- from typing import Dict, Iterable, List, Optional, Union
18
-
19
- import numpy as np
20
- from transformers import (
21
- AutoImageProcessor,
22
- AutoProcessor,
23
- BaseImageProcessor,
24
- BatchFeature,
25
- PretrainedConfig,
26
- ProcessorMixin,
27
- TensorType,
28
- )
29
- from transformers.image_transforms import (
30
- convert_to_rgb,
31
- normalize,
32
- rescale,
33
- resize,
34
- to_channel_dimension_format,
35
- )
36
- from transformers.image_utils import (
37
- ChannelDimension,
38
- ImageInput,
39
- PILImageResampling,
40
- VideoInput,
41
- get_image_size,
42
- infer_channel_dimension_format,
43
- is_pil_image,
44
- is_valid_image,
45
- make_list_of_images,
46
- to_numpy_array,
47
- valid_images,
48
- validate_preprocess_arguments,
49
- )
50
- from transformers.modeling_rope_utils import rope_config_validation
51
- from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
52
- from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs
53
- from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
54
- from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
55
-
56
-
57
- def is_valid_list_of_images(images: List):
58
- return images and all(is_valid_image(image) for image in images)
59
-
60
-
61
- class Qwen2_5_VLVisionConfig(PretrainedConfig):
62
- model_type = "qwen2_5_vl"
63
- base_config_key = "vision_config"
64
-
65
- def __init__(
66
- self,
67
- depth=32,
68
- hidden_size=3584,
69
- hidden_act="silu",
70
- intermediate_size=3420,
71
- num_heads=16,
72
- in_channels=3,
73
- patch_size=14,
74
- spatial_merge_size=2,
75
- temporal_patch_size=2,
76
- tokens_per_second=4,
77
- window_size=112,
78
- out_hidden_size=3584,
79
- fullatt_block_indexes=[7, 15, 23, 31],
80
- **kwargs,
81
- ):
82
- super().__init__(**kwargs)
83
-
84
- self.depth = depth
85
- self.hidden_size = hidden_size
86
- self.hidden_act = hidden_act
87
- self.intermediate_size = intermediate_size
88
- self.num_heads = num_heads
89
- self.in_channels = in_channels
90
- self.patch_size = patch_size
91
- self.spatial_merge_size = spatial_merge_size
92
- self.temporal_patch_size = temporal_patch_size
93
- self.tokens_per_second = tokens_per_second
94
- self.window_size = window_size
95
- self.fullatt_block_indexes = fullatt_block_indexes
96
- self.out_hidden_size = out_hidden_size
97
-
98
-
99
- class Qwen2_5_VLConfig(PretrainedConfig):
100
- r"""
101
- This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
102
- Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
103
- with the defaults will yield a similar configuration to that of
104
- Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
105
-
106
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
107
- documentation from [`PretrainedConfig`] for more information.
108
-
109
-
110
- Args:
111
- vocab_size (`int`, *optional*, defaults to 152064):
112
- Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
113
- `inputs_ids` passed when calling [`Qwen2_5_VLModel`]
114
- hidden_size (`int`, *optional*, defaults to 8192):
115
- Dimension of the hidden representations.
116
- intermediate_size (`int`, *optional*, defaults to 29568):
117
- Dimension of the MLP representations.
118
- num_hidden_layers (`int`, *optional*, defaults to 80):
119
- Number of hidden layers in the Transformer encoder.
120
- num_attention_heads (`int`, *optional*, defaults to 64):
121
- Number of attention heads for each attention layer in the Transformer encoder.
122
- num_key_value_heads (`int`, *optional*, defaults to 8):
123
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
124
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
125
- `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
126
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
127
- by meanpooling all the original heads within that group. For more details, check out [this
128
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
129
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
130
- The non-linear activation function (function or string) in the decoder.
131
- max_position_embeddings (`int`, *optional*, defaults to 32768):
132
- The maximum sequence length that this model might ever be used with.
133
- initializer_range (`float`, *optional*, defaults to 0.02):
134
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
135
- rms_norm_eps (`float`, *optional*, defaults to 1e-05):
136
- The epsilon used by the rms normalization layers.
137
- use_cache (`bool`, *optional*, defaults to `True`):
138
- Whether or not the model should return the last key/values attentions (not used by all models). Only
139
- relevant if `config.is_decoder=True`.
140
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
141
- Whether the model's input and output word embeddings should be tied.
142
- rope_theta (`float`, *optional*, defaults to 1000000.0):
143
- The base period of the RoPE embeddings.
144
- use_sliding_window (`bool`, *optional*, defaults to `False`):
145
- Whether to use sliding window attention.
146
- sliding_window (`int`, *optional*, defaults to 4096):
147
- Sliding window attention (SWA) window size. If not specified, will default to `4096`.
148
- max_window_layers (`int`, *optional*, defaults to 80):
149
- The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
150
- attention_dropout (`float`, *optional*, defaults to 0.0):
151
- The dropout ratio for the attention probabilities.
152
- vision_config (`Dict`, *optional*):
153
- The config for the visual encoder initialization.
154
- rope_scaling (`Dict`, *optional*):
155
- Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
156
- and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
157
- accordingly.
158
- Expected contents:
159
- `rope_type` (`str`):
160
- The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
161
- 'llama3'], with 'default' being the original RoPE implementation.
162
- `factor` (`float`, *optional*):
163
- Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
164
- most scaling types, a `factor` of x will enable the model to handle sequences of length x *
165
- original maximum pre-trained length.
166
- `original_max_position_embeddings` (`int`, *optional*):
167
- Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
168
- pretraining.
169
- `attention_factor` (`float`, *optional*):
170
- Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
171
- computation. If unspecified, it defaults to value recommended by the implementation, using the
172
- `factor` field to infer the suggested value.
173
- `beta_fast` (`float`, *optional*):
174
- Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
175
- ramp function. If unspecified, it defaults to 32.
176
- `beta_slow` (`float`, *optional*):
177
- Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
178
- ramp function. If unspecified, it defaults to 1.
179
- `short_factor` (`List[float]`, *optional*):
180
- Only used with 'longrope'. The scaling factor to be applied to short contexts (<
181
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
182
- size divided by the number of attention heads divided by 2
183
- `long_factor` (`List[float]`, *optional*):
184
- Only used with 'longrope'. The scaling factor to be applied to long contexts (>
185
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
186
- size divided by the number of attention heads divided by 2
187
- `low_freq_factor` (`float`, *optional*):
188
- Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
189
- `high_freq_factor` (`float`, *optional*):
190
- Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
191
-
192
- ```python
193
- >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
194
-
195
- >>> # Initializing a Qwen2_5_VL style configuration
196
- >>> configuration = Qwen2_5_VLConfig()
197
-
198
- >>> # Initializing a model from the Qwen2-VL-7B style configuration
199
- >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
200
-
201
- >>> # Accessing the model configuration
202
- >>> configuration = model.config
203
- ```"""
204
-
205
- model_type = "qwen2_5_vl"
206
- sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
207
- keys_to_ignore_at_inference = ["past_key_values"]
208
- # Default tensor parallel plan for base model `Qwen2_5_VL`
209
- base_model_tp_plan = {
210
- "layers.*.self_attn.q_proj": "colwise",
211
- "layers.*.self_attn.k_proj": "colwise",
212
- "layers.*.self_attn.v_proj": "colwise",
213
- "layers.*.self_attn.o_proj": "rowwise",
214
- "layers.*.mlp.gate_proj": "colwise",
215
- "layers.*.mlp.up_proj": "colwise",
216
- "layers.*.mlp.down_proj": "rowwise",
217
- }
218
-
219
- def __init__(
220
- self,
221
- vocab_size=152064,
222
- hidden_size=8192,
223
- intermediate_size=29568,
224
- num_hidden_layers=80,
225
- num_attention_heads=64,
226
- num_key_value_heads=8,
227
- hidden_act="silu",
228
- max_position_embeddings=32768,
229
- initializer_range=0.02,
230
- rms_norm_eps=1e-05,
231
- use_cache=True,
232
- tie_word_embeddings=False,
233
- rope_theta=1000000.0,
234
- use_sliding_window=False,
235
- sliding_window=4096,
236
- max_window_layers=80,
237
- attention_dropout=0.0,
238
- vision_config=None,
239
- rope_scaling=None,
240
- **kwargs,
241
- ):
242
- if isinstance(vision_config, dict):
243
- self.vision_config = self.sub_configs["vision_config"](**vision_config)
244
- elif vision_config is None:
245
- self.vision_config = self.sub_configs["vision_config"]()
246
-
247
- self.vocab_size = vocab_size
248
- self.max_position_embeddings = max_position_embeddings
249
- self.hidden_size = hidden_size
250
- self.intermediate_size = intermediate_size
251
- self.num_hidden_layers = num_hidden_layers
252
- self.num_attention_heads = num_attention_heads
253
- self.use_sliding_window = use_sliding_window
254
- self.sliding_window = sliding_window
255
- self.max_window_layers = max_window_layers
256
-
257
- # for backward compatibility
258
- if num_key_value_heads is None:
259
- num_key_value_heads = num_attention_heads
260
-
261
- self.num_key_value_heads = num_key_value_heads
262
- self.hidden_act = hidden_act
263
- self.initializer_range = initializer_range
264
- self.rms_norm_eps = rms_norm_eps
265
- self.use_cache = use_cache
266
- self.rope_theta = rope_theta
267
- self.attention_dropout = attention_dropout
268
- self.rope_scaling = rope_scaling
269
-
270
- # Validate the correctness of rotary position embeddings parameters
271
- # BC: if there is a 'type' field, move it to 'rope_type'.
272
- # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
273
- # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
274
- # TODO: @raushan update config in the hub
275
- if self.rope_scaling is not None and "type" in self.rope_scaling:
276
- if self.rope_scaling["type"] == "mrope":
277
- self.rope_scaling["type"] = "default"
278
- self.rope_scaling["rope_type"] = self.rope_scaling["type"]
279
- rope_config_validation(self, ignore_keys={"mrope_section"})
280
-
281
- super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
282
-
283
-
284
- # FIXME: workaround for an obsolete transformers version
285
-
286
-
287
- class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
288
- fps: Union[List[float], float]
289
-
290
-
291
- class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
292
- videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
293
- _defaults = {
294
- "text_kwargs": {
295
- "padding": False,
296
- },
297
- "videos_kwargs": {"fps": 2.0},
298
- }
299
-
300
-
301
- class Qwen2_5_VLProcessor(ProcessorMixin):
302
- r"""
303
- Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
304
- [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
305
- [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
306
- Args:
307
- image_processor ([`Qwen2VLImageProcessor`], *optional*):
308
- The image processor is a required input.
309
- tokenizer ([`Qwen2TokenizerFast`], *optional*):
310
- The tokenizer is a required input.
311
- chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
312
- in a chat into a tokenizable string.
313
- """
314
-
315
- attributes = ["image_processor", "tokenizer"]
316
- valid_kwargs = ["chat_template"]
317
-
318
- image_processor_class = "AutoImageProcessor"
319
- tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
320
-
321
- def __init__(
322
- self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
323
- ):
324
- self.image_token = (
325
- "<|image_pad|>"
326
- if not hasattr(tokenizer, "image_token")
327
- else tokenizer.image_token
328
- )
329
- self.video_token = (
330
- "<|video_pad|>"
331
- if not hasattr(tokenizer, "video_token")
332
- else tokenizer.video_token
333
- )
334
- super().__init__(image_processor, tokenizer, chat_template=chat_template)
335
-
336
- def __call__(
337
- self,
338
- images: ImageInput = None,
339
- text: Union[
340
- TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
341
- ] = None,
342
- videos: VideoInput = None,
343
- **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
344
- ) -> BatchFeature:
345
- """
346
- Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
347
- and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
348
- the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
349
- Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
350
-
351
- Args:
352
- images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
353
- The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
354
- tensor. Both channels-first and channels-last formats are supported.
355
- text (`str`, `List[str]`, `List[List[str]]`):
356
- The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
357
- (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
358
- `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
359
- videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
360
- The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
361
- tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
362
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
363
- If set, will return tensors of a particular framework. Acceptable values are:
364
- - `'tf'`: Return TensorFlow `tf.constant` objects.
365
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
366
- - `'np'`: Return NumPy `np.ndarray` objects.
367
- - `'jax'`: Return JAX `jnp.ndarray` objects.
368
-
369
- Returns:
370
- [`BatchFeature`]: A [`BatchFeature`] with the following fields:
371
-
372
- - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
373
- - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
374
- `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
375
- `None`).
376
- - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
377
- - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
378
- - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
379
- - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
380
- - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
381
- """
382
- output_kwargs = self._merge_kwargs(
383
- Qwen2_5_VLProcessorKwargs,
384
- tokenizer_init_kwargs=self.tokenizer.init_kwargs,
385
- **kwargs,
386
- )
387
- if images is not None:
388
- image_inputs = self.image_processor(
389
- images=images, videos=None, **output_kwargs["images_kwargs"]
390
- )
391
- image_grid_thw = image_inputs["image_grid_thw"]
392
- else:
393
- image_inputs = {}
394
- image_grid_thw = None
395
-
396
- if videos is not None:
397
- videos_inputs = self.image_processor(
398
- images=None, videos=videos, **output_kwargs["images_kwargs"]
399
- )
400
- video_grid_thw = videos_inputs["video_grid_thw"]
401
-
402
- fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
403
- if isinstance(fps, (int, float)):
404
- second_per_grid_ts = [
405
- self.image_processor.temporal_patch_size / fps
406
- ] * len(video_grid_thw)
407
- elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
408
- second_per_grid_ts = [
409
- self.image_processor.temporal_patch_size / tmp for tmp in fps
410
- ]
411
- else:
412
- raise ValueError(
413
- f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
414
- )
415
- videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
416
-
417
- else:
418
- videos_inputs = {}
419
- video_grid_thw = None
420
-
421
- if not isinstance(text, list):
422
- text = [text]
423
-
424
- if image_grid_thw is not None:
425
- merge_length = self.image_processor.merge_size**2
426
- index = 0
427
- for i in range(len(text)):
428
- while self.image_token in text[i]:
429
- text[i] = text[i].replace(
430
- self.image_token,
431
- "<|placeholder|>"
432
- * (image_grid_thw[index].prod() // merge_length),
433
- 1,
434
- )
435
- index += 1
436
- text[i] = text[i].replace("<|placeholder|>", self.image_token)
437
-
438
- if video_grid_thw is not None:
439
- merge_length = self.image_processor.merge_size**2
440
- index = 0
441
- for i in range(len(text)):
442
- while self.video_token in text[i]:
443
- text[i] = text[i].replace(
444
- self.video_token,
445
- "<|placeholder|>"
446
- * (video_grid_thw[index].prod() // merge_length),
447
- 1,
448
- )
449
- index += 1
450
- text[i] = text[i].replace("<|placeholder|>", self.video_token)
451
-
452
- text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
453
-
454
- return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
455
-
456
- def batch_decode(self, *args, **kwargs):
457
- """
458
- This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
459
- refer to the docstring of this method for more information.
460
- """
461
- return self.tokenizer.batch_decode(*args, **kwargs)
462
-
463
- def decode(self, *args, **kwargs):
464
- """
465
- This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
466
- the docstring of this method for more information.
467
- """
468
- return self.tokenizer.decode(*args, **kwargs)
469
-
470
- def post_process_image_text_to_text(self, generated_outputs):
471
- """
472
- Post-process the output of the model to decode the text.
473
-
474
- Args:
475
- generated_outputs (`torch.Tensor` or `np.ndarray`):
476
- The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
477
- or `(sequence_length,)`.
478
-
479
- Returns:
480
- `List[str]`: The decoded text.
481
- """
482
- return self.tokenizer.batch_decode(
483
- generated_outputs,
484
- skip_special_tokens=True,
485
- clean_up_tokenization_spaces=False,
486
- )
487
-
488
- @property
489
- def model_input_names(self):
490
- tokenizer_input_names = self.tokenizer.model_input_names
491
- image_processor_input_names = self.image_processor.model_input_names
492
- names_from_processor = list(
493
- dict.fromkeys(tokenizer_input_names + image_processor_input_names)
494
- )
495
- return names_from_processor + ["second_per_grid_ts"]
496
-
497
-
498
- class Qwen2_5_VLImageProcessor(BaseImageProcessor):
499
- r"""
500
- Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images.
501
-
502
- Args:
503
- do_resize (`bool`, *optional*, defaults to `True`):
504
- Whether to resize the image's (height, width) dimensions.
505
- resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
506
- Resampling filter to use when resizing the image.
507
- do_rescale (`bool`, *optional*, defaults to `True`):
508
- Whether to rescale the image by the specified scale `rescale_factor`.
509
- rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
510
- Scale factor to use if rescaling the image.
511
- do_normalize (`bool`, *optional*, defaults to `True`):
512
- Whether to normalize the image.
513
- image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
514
- Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
515
- image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
516
- Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
517
- do_convert_rgb (`bool`, *optional*, defaults to `True`):
518
- Whether to convert the image to RGB.
519
- min_pixels (`int`, *optional*, defaults to `56 * 56`):
520
- The min pixels of the image to resize the image.
521
- max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
522
- The max pixels of the image to resize the image.
523
- patch_size (`int`, *optional*, defaults to 14):
524
- The spatial patch size of the vision encoder.
525
- temporal_patch_size (`int`, *optional*, defaults to 2):
526
- The temporal patch size of the vision encoder.
527
- merge_size (`int`, *optional*, defaults to 2):
528
- The merge size of the vision encoder to llm encoder.
529
- """
530
-
531
- model_input_names = [
532
- "pixel_values",
533
- "image_grid_thw",
534
- "pixel_values_videos",
535
- "video_grid_thw",
536
- "second_per_grid_ts",
537
- ]
538
-
539
- def __init__(
540
- self,
541
- do_resize: bool = True,
542
- resample: PILImageResampling = PILImageResampling.BICUBIC,
543
- do_rescale: bool = True,
544
- rescale_factor: Union[int, float] = 1 / 255,
545
- do_normalize: bool = True,
546
- image_mean: Optional[Union[float, List[float]]] = None,
547
- image_std: Optional[Union[float, List[float]]] = None,
548
- do_convert_rgb: bool = True,
549
- min_pixels: int = 56 * 56,
550
- max_pixels: int = 28 * 28 * 1280,
551
- patch_size: int = 14,
552
- temporal_patch_size: int = 2,
553
- merge_size: int = 2,
554
- **kwargs,
555
- ) -> None:
556
- super().__init__(**kwargs)
557
- self.do_resize = do_resize
558
- self.resample = resample
559
- self.do_rescale = do_rescale
560
- self.rescale_factor = rescale_factor
561
- self.do_normalize = do_normalize
562
- self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
563
- self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
564
- self.min_pixels = min_pixels
565
- self.max_pixels = max_pixels
566
- self.patch_size = patch_size
567
- self.temporal_patch_size = temporal_patch_size
568
- self.merge_size = merge_size
569
- self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
570
- self.do_convert_rgb = do_convert_rgb
571
-
572
- def rescale(
573
- self,
574
- image: np.ndarray,
575
- scale: float,
576
- data_format: Optional[Union[str, ChannelDimension]] = None,
577
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
578
- **kwargs,
579
- ) -> np.ndarray:
580
- """
581
- Rescale an image by a scale factor. image = image * scale.
582
-
583
- Args:
584
- image (`np.ndarray`):
585
- Image to rescale.
586
- scale (`float`):
587
- The scaling factor to rescale pixel values by.
588
- data_format (`str` or `ChannelDimension`, *optional*):
589
- The channel dimension format for the output image. If unset, the channel dimension format of the input
590
- image is used. Can be one of:
591
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
592
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
593
- input_data_format (`ChannelDimension` or `str`, *optional*):
594
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
595
- from the input image. Can be one of:
596
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
597
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
598
-
599
- Returns:
600
- `np.ndarray`: The rescaled image.
601
- """
602
- return rescale(
603
- image,
604
- scale=scale,
605
- data_format=data_format,
606
- input_data_format=input_data_format,
607
- **kwargs,
608
- )
609
-
610
- def normalize(
611
- self,
612
- image: np.ndarray,
613
- mean: Union[float, Iterable[float]],
614
- std: Union[float, Iterable[float]],
615
- data_format: Optional[Union[str, ChannelDimension]] = None,
616
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
617
- **kwargs,
618
- ) -> np.ndarray:
619
- """
620
- Normalize an image. image = (image - image_mean) / image_std.
621
-
622
- Args:
623
- image (`np.ndarray`):
624
- Image to normalize.
625
- mean (`float` or `Iterable[float]`):
626
- Image mean to use for normalization.
627
- std (`float` or `Iterable[float]`):
628
- Image standard deviation to use for normalization.
629
- data_format (`str` or `ChannelDimension`, *optional*):
630
- The channel dimension format for the output image. If unset, the channel dimension format of the input
631
- image is used. Can be one of:
632
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
633
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
634
- input_data_format (`ChannelDimension` or `str`, *optional*):
635
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
636
- from the input image. Can be one of:
637
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
638
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
639
-
640
- Returns:
641
- `np.ndarray`: The normalized image.
642
- """
643
- return normalize(
644
- image,
645
- mean=mean,
646
- std=std,
647
- data_format=data_format,
648
- input_data_format=input_data_format,
649
- **kwargs,
650
- )
651
-
652
- def _preprocess(
653
- self,
654
- images: Union[ImageInput, VideoInput],
655
- do_resize: bool = None,
656
- resample: PILImageResampling = None,
657
- do_rescale: bool = None,
658
- rescale_factor: float = None,
659
- do_normalize: bool = None,
660
- image_mean: Optional[Union[float, List[float]]] = None,
661
- image_std: Optional[Union[float, List[float]]] = None,
662
- do_convert_rgb: bool = None,
663
- data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
664
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
665
- ):
666
- """
667
- Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
668
-
669
- Args:
670
- images (`ImageInput`):
671
- Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
672
- vision_info (`List[Dict]`, *optional*):
673
- Optional list of dictionaries containing additional information about vision inputs.
674
- do_resize (`bool`, *optional*, defaults to `self.do_resize`):
675
- Whether to resize the image.
676
- resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
677
- Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
678
- do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
679
- Whether to rescale the image.
680
- rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
681
- Scale factor to use if rescaling the image.
682
- do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
683
- Whether to normalize the image.
684
- image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
685
- Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
686
- image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
687
- Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
688
- do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
689
- Whether to convert the image to RGB.
690
- data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
691
- The channel dimension format for the output image. Can be one of:
692
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
693
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
694
- - Unset: Use the channel dimension format of the input image.
695
- input_data_format (`ChannelDimension` or `str`, *optional*):
696
- The channel dimension format for the input image. Can be one of:
697
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
698
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
699
- - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
700
- """
701
- images = make_list_of_images(images)
702
-
703
- if do_convert_rgb:
704
- images = [convert_to_rgb(image) for image in images]
705
-
706
- # All transformations expect numpy arrays.
707
- images = [to_numpy_array(image) for image in images]
708
-
709
- if input_data_format is None:
710
- # We assume that all images have the same channel dimension format.
711
- input_data_format = infer_channel_dimension_format(images[0])
712
-
713
- height, width = get_image_size(images[0], channel_dim=input_data_format)
714
- resized_height, resized_width = height, width
715
- processed_images = []
716
- for image in images:
717
- if do_resize:
718
- resized_height, resized_width = smart_resize(
719
- height,
720
- width,
721
- factor=self.patch_size * self.merge_size,
722
- min_pixels=self.min_pixels,
723
- max_pixels=self.max_pixels,
724
- )
725
- image = resize(
726
- image,
727
- size=(resized_height, resized_width),
728
- resample=resample,
729
- input_data_format=input_data_format,
730
- )
731
-
732
- if do_rescale:
733
- image = self.rescale(
734
- image, scale=rescale_factor, input_data_format=input_data_format
735
- )
736
-
737
- if do_normalize:
738
- image = self.normalize(
739
- image=image,
740
- mean=image_mean,
741
- std=image_std,
742
- input_data_format=input_data_format,
743
- )
744
-
745
- image = to_channel_dimension_format(
746
- image, data_format, input_channel_dim=input_data_format
747
- )
748
- processed_images.append(image)
749
-
750
- patches = np.array(processed_images)
751
- if data_format == ChannelDimension.LAST:
752
- patches = patches.transpose(0, 3, 1, 2)
753
- if patches.shape[0] % self.temporal_patch_size != 0:
754
- repeats = np.repeat(
755
- patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0
756
- )
757
- patches = np.concatenate([patches, repeats], axis=0)
758
- channel = patches.shape[1]
759
- grid_t = patches.shape[0] // self.temporal_patch_size
760
- grid_h, grid_w = (
761
- resized_height // self.patch_size,
762
- resized_width // self.patch_size,
763
- )
764
- patches = patches.reshape(
765
- grid_t,
766
- self.temporal_patch_size,
767
- channel,
768
- grid_h // self.merge_size,
769
- self.merge_size,
770
- self.patch_size,
771
- grid_w // self.merge_size,
772
- self.merge_size,
773
- self.patch_size,
774
- )
775
- patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
776
- flatten_patches = patches.reshape(
777
- grid_t * grid_h * grid_w,
778
- channel * self.temporal_patch_size * self.patch_size * self.patch_size,
779
- )
780
-
781
- return flatten_patches, (grid_t, grid_h, grid_w)
782
-
783
- def preprocess(
784
- self,
785
- images: ImageInput,
786
- videos: VideoInput = None,
787
- do_resize: bool = None,
788
- size: Dict[str, int] = None,
789
- resample: PILImageResampling = None,
790
- do_rescale: bool = None,
791
- rescale_factor: float = None,
792
- do_normalize: bool = None,
793
- image_mean: Optional[Union[float, List[float]]] = None,
794
- image_std: Optional[Union[float, List[float]]] = None,
795
- do_convert_rgb: bool = None,
796
- return_tensors: Optional[Union[str, TensorType]] = None,
797
- data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
798
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
799
- ):
800
- """
801
- Args:
802
- images (`ImageInput`):
803
- Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
804
- passing in images with pixel values between 0 and 1, set `do_rescale=False`.
805
- videos (`VideoInput`):
806
- Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
807
- passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
808
- do_resize (`bool`, *optional*, defaults to `self.do_resize`):
809
- Whether to resize the image.
810
- size (`Dict[str, int]`, *optional*, defaults to `self.size`):
811
- Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
812
- the longest edge resized to keep the input aspect ratio.
813
- resample (`int`, *optional*, defaults to `self.resample`):
814
- Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
815
- has an effect if `do_resize` is set to `True`.
816
- do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
817
- Whether to rescale the image.
818
- rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
819
- Rescale factor to rescale the image by if `do_rescale` is set to `True`.
820
- do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
821
- Whether to normalize the image.
822
- image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
823
- Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
824
- image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
825
- Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
826
- `True`.
827
- do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
828
- Whether to convert the image to RGB.
829
- return_tensors (`str` or `TensorType`, *optional*):
830
- The type of tensors to return. Can be one of:
831
- - Unset: Return a list of `np.ndarray`.
832
- - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
833
- - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
834
- - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
835
- - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
836
- data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
837
- The channel dimension format for the output image. Can be one of:
838
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
839
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
840
- - Unset: Use the channel dimension format of the input image.
841
- input_data_format (`ChannelDimension` or `str`, *optional*):
842
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
843
- from the input image. Can be one of:
844
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
845
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
846
- - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
847
-
848
- """
849
- do_resize = do_resize if do_resize is not None else self.do_resize
850
- size = size if size is not None else self.size
851
- resample = resample if resample is not None else self.resample
852
- do_rescale = do_rescale if do_rescale is not None else self.do_rescale
853
- rescale_factor = (
854
- rescale_factor if rescale_factor is not None else self.rescale_factor
855
- )
856
- do_normalize = do_normalize if do_normalize is not None else self.do_normalize
857
- image_mean = image_mean if image_mean is not None else self.image_mean
858
- image_std = image_std if image_std is not None else self.image_std
859
- do_convert_rgb = (
860
- do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
861
- )
862
-
863
- def make_flat_list_of_images(
864
- images: Union[List[ImageInput], ImageInput],
865
- ) -> ImageInput:
866
- """
867
- Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
868
- If the input is a nested list of images, it is converted to a flat list of images.
869
- Args:
870
- images (`Union[List[ImageInput], ImageInput]`):
871
- The input image.
872
- Returns:
873
- list: A list of images or a 4d array of images.
874
- """
875
- # If the input is a nested list of images, we flatten it
876
- if (
877
- isinstance(images, (list, tuple))
878
- and all(isinstance(images_i, (list, tuple)) for images_i in images)
879
- and all(is_valid_list_of_images(images_i) for images_i in images)
880
- ):
881
- return [img for img_list in images for img in img_list]
882
-
883
- if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
884
- if is_pil_image(images[0]) or images[0].ndim == 3:
885
- return images
886
- if images[0].ndim == 4:
887
- return [img for img_list in images for img in img_list]
888
-
889
- if is_valid_image(images):
890
- if is_pil_image(images) or images.ndim == 3:
891
- return [images]
892
- if images.ndim == 4:
893
- return list(images)
894
-
895
- raise ValueError(f"Could not make a flat list of images from {images}")
896
-
897
- def make_batched_videos(videos) -> VideoInput:
898
- """
899
- Ensure that the input is a list of videos.
900
- Args:
901
- videos (`VideoInput`):
902
- Video or videos to turn into a list of videos.
903
- Returns:
904
- list: A list of videos.
905
- """
906
- if (
907
- isinstance(videos, (list, tuple))
908
- and isinstance(videos[0], (list, tuple))
909
- and is_valid_image(videos[0][0])
910
- ):
911
- # case 1: nested batch of videos so we flatten it
912
- if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4:
913
- videos = [
914
- [video for batch_list in batched_videos for video in batch_list]
915
- for batched_videos in videos
916
- ]
917
- # case 2: list of videos represented as list of video frames
918
- return videos
919
-
920
- elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
921
- if is_pil_image(videos[0]) or videos[0].ndim == 3:
922
- return [videos]
923
- elif videos[0].ndim == 4:
924
- return [list(video) for video in videos]
925
-
926
- elif is_valid_image(videos):
927
- if is_pil_image(videos) or videos.ndim == 3:
928
- return [[videos]]
929
- elif videos.ndim == 4:
930
- return [list(videos)]
931
-
932
- raise ValueError(f"Could not make batched video from {videos}")
933
-
934
- if images is not None:
935
- images = make_flat_list_of_images(images)
936
- if videos is not None:
937
- videos = make_batched_videos(videos)
938
-
939
- if images is not None and not valid_images(images):
940
- raise ValueError(
941
- "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
942
- "torch.Tensor, tf.Tensor or jax.ndarray."
943
- )
944
-
945
- validate_preprocess_arguments(
946
- rescale_factor=rescale_factor,
947
- do_normalize=do_normalize,
948
- image_mean=image_mean,
949
- image_std=image_std,
950
- do_resize=do_resize,
951
- size=size,
952
- resample=resample,
953
- )
954
-
955
- if images is not None:
956
- pixel_values, vision_grid_thws = [], []
957
- for image in images:
958
- patches, image_grid_thw = self._preprocess(
959
- image,
960
- do_resize=do_resize,
961
- resample=resample,
962
- do_rescale=do_rescale,
963
- rescale_factor=rescale_factor,
964
- do_normalize=do_normalize,
965
- image_mean=image_mean,
966
- image_std=image_std,
967
- data_format=data_format,
968
- do_convert_rgb=do_convert_rgb,
969
- input_data_format=input_data_format,
970
- )
971
- pixel_values.extend(patches)
972
- vision_grid_thws.append(image_grid_thw)
973
- pixel_values = np.array(pixel_values)
974
- vision_grid_thws = np.array(vision_grid_thws)
975
- data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
976
-
977
- if videos is not None:
978
- pixel_values, vision_grid_thws = [], []
979
- for images in videos:
980
- patches, video_grid_thw = self._preprocess(
981
- images,
982
- do_resize=do_resize,
983
- resample=resample,
984
- do_rescale=do_rescale,
985
- rescale_factor=rescale_factor,
986
- do_normalize=do_normalize,
987
- image_mean=image_mean,
988
- image_std=image_std,
989
- data_format=data_format,
990
- do_convert_rgb=do_convert_rgb,
991
- input_data_format=input_data_format,
992
- )
993
- pixel_values.extend(patches)
994
- vision_grid_thws.append(video_grid_thw)
995
- pixel_values = np.array(pixel_values)
996
- vision_grid_thws = np.array(vision_grid_thws)
997
- data = {
998
- "pixel_values_videos": pixel_values,
999
- "video_grid_thw": vision_grid_thws,
1000
- }
1001
-
1002
- return BatchFeature(data=data, tensor_type=return_tensors)
1003
-
1004
-
1005
- AutoImageProcessor.register(Qwen2_5_VLConfig, None, Qwen2_5_VLImageProcessor, None)
1006
- AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor)
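The removed `qwen2_5_vl_config.py` above vendored the Qwen2.5-VL config, processor, and image processor (the `# FIXME: workaround for an obsolete transformers version` comment marks the copy as a stopgap). For readers skimming the deleted `_preprocess` method, the short sketch below replays its patch-grid arithmetic with NumPy only; the image size and patch parameters are illustrative defaults taken from the code above, not from any real checkpoint.

```python
# Standalone sketch of the patch-grid math in the deleted
# Qwen2_5_VLImageProcessor._preprocess; values are illustrative only.
import numpy as np

patch_size, temporal_patch_size, merge_size = 14, 2, 2

# One RGB image already resized by smart_resize to a multiple of
# patch_size * merge_size = 28 on both sides.
resized_height, resized_width = 476, 644  # 34 x 46 patches
patches = np.zeros((1, 3, resized_height, resized_width), dtype=np.float32)

# The deleted code repeats the last frame (temporal_patch_size - 1) times when
# the frame count is not divisible; with temporal_patch_size == 2 that is one copy.
if patches.shape[0] % temporal_patch_size != 0:
    patches = np.concatenate([patches, patches[-1:]], axis=0)

grid_t = patches.shape[0] // temporal_patch_size
grid_h = resized_height // patch_size
grid_w = resized_width // patch_size

# Same reshape / transpose / flatten sequence as the deleted method.
flatten_patches = patches.reshape(
    grid_t, temporal_patch_size, 3,
    grid_h // merge_size, merge_size, patch_size,
    grid_w // merge_size, merge_size, patch_size,
).transpose(0, 3, 6, 4, 7, 2, 1, 5, 8).reshape(
    grid_t * grid_h * grid_w,
    3 * temporal_patch_size * patch_size * patch_size,
)

print(flatten_patches.shape, (grid_t, grid_h, grid_w))  # (1564, 1176) (1, 34, 46)
```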