fount-vlm-nell-02 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py
@@ -0,0 +1,607 @@
+"""Image processor and processor classes for HunyuanVL.
+
+Based on the official HuggingFace transformers implementation.
+Handles image preprocessing and tokenization for the HunyuanVL model.
+"""
+
+import math
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+from PIL import Image
+from transformers import AutoTokenizer
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_processing_utils import ImageProcessingMixin
+from transformers.processing_utils import ProcessorMixin
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+# CLIP normalization constants (same as HF)
+OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 32,
+    min_pixels: int = 512 * 512,
+    max_pixels: int = 2048 * 2048,
+) -> Tuple[int, int]:
+    """Rescale image dimensions to meet constraints.
+
+    Ensures:
+    1. Both dimensions are divisible by 'factor' (patch_size * merge_size)
+    2. Total pixels within [min_pixels, max_pixels]
+    3. Aspect ratio maintained as closely as possible
+
+    Args:
+        height: Original image height
+        width: Original image width
+        factor: Divisibility factor (default: patch_size * merge_size = 16 * 2 = 32)
+        min_pixels: Minimum total pixels
+        max_pixels: Maximum total pixels
+
+    Returns:
+        Tuple of (resized_height, resized_width)
+    """
+    if max(height, width) / min(height, width) > 200:
+        raise ValueError(
+            f"Absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+        )
+
+    # Round to nearest factor
+    h_bar = round(height / factor) * factor
+    w_bar = round(width / factor) * factor
+
+    if h_bar * w_bar > max_pixels:
+        # Scale down to fit max_pixels
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = max(factor, math.floor(height / beta / factor) * factor)
+        w_bar = max(factor, math.floor(width / beta / factor) * factor)
+    elif h_bar * w_bar < min_pixels:
+        # Scale up to meet min_pixels
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
+
+    return h_bar, w_bar
+
+
+class HunYuanVLImageProcessor(ImageProcessingMixin):
+    """Image processor for HunyuanVL model.
+
+    Handles resizing, normalization, and patch extraction for images.
+
+    Note: This class inherits from ImageProcessingMixin but NOT BaseImageProcessor
+    to avoid automatic prepare_inputs() behavior while satisfying type checks.
+    """
+
+    model_input_names = ["pixel_values", "image_grid_thw"]
+
+    def __init__(
+        self,
+        min_pixels: int = 512 * 512,
+        max_pixels: int = 2048 * 2048,
+        patch_size: int = 16,
+        temporal_patch_size: int = 1,
+        merge_size: int = 2,
+        image_mean: Tuple[float, float, float] = OPENAI_CLIP_MEAN,
+        image_std: Tuple[float, float, float] = OPENAI_CLIP_STD,
+        do_resize: bool = True,
+        do_normalize: bool = True,
+        do_convert_rgb: bool = True,
+        config: Optional[Dict] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Override with config values if provided
+        if config is not None:
+            vision_config = config.get("vision_config", {})
+            min_pixels = config.get("min_pixels", min_pixels)
+            max_pixels = config.get("max_pixels", max_pixels)
+            patch_size = vision_config.get("patch_size", patch_size)
+            merge_size = vision_config.get("spatial_merge_size", merge_size)
+
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.merge_size = merge_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_resize = do_resize
+        self.do_normalize = do_normalize
+        self.do_convert_rgb = do_convert_rgb
+
+    def _preprocess_single(
+        self,
+        image: Image.Image,
+    ) -> Tuple[np.ndarray, Tuple[int, int, int]]:
+        """Preprocess a single image.
+
+        Args:
+            image: PIL Image
+
+        Returns:
+            Tuple of (flattened_patches, (grid_t, grid_h, grid_w))
+        """
+        # Convert to RGB if needed
+        if self.do_convert_rgb and image.mode != "RGB":
+            image = image.convert("RGB")
+
+        width, height = image.size
+        resized_width, resized_height = width, height
+
+        # Resize to meet constraints
+        if self.do_resize:
+            factor = self.patch_size * self.merge_size
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=factor,
+                min_pixels=self.min_pixels,
+                max_pixels=self.max_pixels,
+            )
+            image = image.resize((resized_width, resized_height), Image.BILINEAR)
+
+        # Convert to numpy array and normalize
+        img_array = np.array(image).astype(np.float32) / 255.0
+
+        if self.do_normalize:
+            mean = np.array(self.image_mean).reshape(1, 1, 3)
+            std = np.array(self.image_std).reshape(1, 1, 3)
+            img_array = (img_array - mean) / std
+
+        # Transpose to CHW format
+        img_array = img_array.transpose(2, 0, 1)  # (C, H, W)
+
+        # Calculate grid dimensions
+        grid_h = resized_height // self.patch_size
+        grid_w = resized_width // self.patch_size
+        grid_t = 1  # temporal dimension (always 1 for images)
+
+        # Reshape to patches
+        # Shape: (C, H, W) -> (C, grid_h, merge_size, patch_size, grid_w, merge_size, patch_size)
+        channel = img_array.shape[0]
+        patches = img_array.reshape(
+            channel,
+            grid_h // self.merge_size,
+            self.merge_size,
+            self.patch_size,
+            grid_w // self.merge_size,
+            self.merge_size,
+            self.patch_size,
+        )
+
+        # Transpose and flatten
+        # Target: (num_patches, C * patch_size * patch_size)
+        patches = patches.transpose(
+            1, 2, 4, 5, 0, 3, 6
+        )  # (gh/m, m, gw/m, m, C, ps, ps)
+        flatten_patches = patches.reshape(
+            grid_h * grid_w,
+            channel * self.patch_size * self.patch_size,
+        )
+
+        return flatten_patches, (grid_t, grid_h, grid_w)
+
+    def preprocess(
+        self,
+        images: Union[Image.Image, List[Image.Image]],
+        **kwargs,
+    ) -> Dict[str, np.ndarray]:
+        """Preprocess one or more images.
+
+        Args:
+            images: Single PIL Image or list of PIL Images
+
+        Returns:
+            Dictionary with:
+                - pixel_values: (total_patches, C * patch_size * patch_size)
+                - image_grid_thw: (num_images, 3) with [temporal, height, width] grids
+        """
+        if isinstance(images, Image.Image):
+            images = [images]
+
+        all_patches = []
+        all_grids = []
+
+        for image in images:
+            patches, grid_thw = self._preprocess_single(image)
+            all_patches.append(patches)
+            all_grids.append(grid_thw)
+
+        # Stack patches from all images
+        pixel_values = np.concatenate(all_patches, axis=0)
+        image_grid_thw = np.array(all_grids)
+
+        return {
+            "pixel_values": pixel_values,
+            "image_grid_thw": image_grid_thw,
+        }
+
+    def __call__(
+        self,
+        images: Union[Image.Image, List[Image.Image]],
+        **kwargs,
+    ) -> BatchFeature:
+        """Process images and return BatchFeature."""
+        data = self.preprocess(images, **kwargs)
+        return BatchFeature(data=data)
+
+    def get_number_of_image_patches(
+        self,
+        height: int,
+        width: int,
+        **kwargs,
+    ) -> int:
+        """Calculate number of image tokens for given dimensions.
+
+        Token count formula: patch_h * (patch_w + 1) + 2
+        - patch_h = grid_h / merge_size
+        - patch_w = grid_w / merge_size
+        - +1 per row for newline token
+        - +2 for begin/end tokens
+
+        Args:
+            height: Image height
+            width: Image width
+
+        Returns:
+            Number of image tokens
+        """
+        factor = self.patch_size * self.merge_size
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=factor,
+            min_pixels=self.min_pixels,
+            max_pixels=self.max_pixels,
+        )
+
+        grid_h = resized_height // self.patch_size
+        grid_w = resized_width // self.patch_size
+
+        # Token count: patch_h * (patch_w + 1) + 2
+        patch_h = grid_h // self.merge_size
+        patch_w = grid_w // self.merge_size
+
+        return patch_h * (patch_w + 1) + 2
+
+    @classmethod
+    def from_dict(cls, config_dict, **kwargs):
+        """Constructs an image processor from a config dictionary."""
+        if "vision_config" not in config_dict:
+            config_dict["vision_config"] = {}
+        return cls(config=config_dict, **kwargs)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """Constructs an image processor from a pretrained model."""
+        return cls(**kwargs)
+
+
+class HunYuanVLProcessor(ProcessorMixin):
+    """Processor for HunyuanVL that combines image processing and tokenization.
+
+    Handles:
+    - Image preprocessing via HunYuanVLImageProcessor
+    - Token replacement for image placeholders
+    - 4D position_ids construction for xdrope
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    # Special token IDs (from HunyuanVL config)
+    IMAGE_TOKEN_ID = 120120
+    IM_START_TOKEN_ID = 120118
+    IM_END_TOKEN_ID = 120119
+    PAD_TOKEN_ID = 120002
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=None,
+        **kwargs,
+    ):
+        if image_processor is None:
+            image_processor = HunYuanVLImageProcessor(**kwargs)
+
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+
+        # Get special token IDs
+        self.image_token_id = self.IMAGE_TOKEN_ID
+        self.im_start_token_id = self.IM_START_TOKEN_ID
+        self.im_end_token_id = self.IM_END_TOKEN_ID
+        self.pad_id = self.PAD_TOKEN_ID
+
+        # Get token strings from tokenizer
+        if tokenizer is not None:
+            self.image_token = tokenizer.convert_ids_to_tokens(self.image_token_id)
+            self.im_start_token = tokenizer.convert_ids_to_tokens(
+                self.im_start_token_id
+            )
+            self.im_end_token = tokenizer.convert_ids_to_tokens(self.im_end_token_id)
+            self.placeholder_token = tokenizer.convert_ids_to_tokens(
+                tokenizer.vocab_size - 1
+            )
+        else:
+            self.image_token = "<image>"
+            self.im_start_token = "<im_start>"
+            self.im_end_token = "<im_end>"
+            self.placeholder_token = "<placeholder>"
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images=None,
+        text: Union[str, List[str]] = None,
+        videos=None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Process images and text for the model.
+
+        Args:
+            images: Single image or list of images
+            text: Single text or list of texts
+            videos: Video inputs (not currently supported)
+            **kwargs: Additional arguments passed to tokenizer
+
+        Returns:
+            BatchFeature with:
+                - input_ids: Token IDs with image placeholders replaced
+                - attention_mask: Attention mask
+                - pixel_values: Processed image patches
+                - image_grid_thw: Grid dimensions for each image
+                - position_ids: 4D position IDs for xdrope
+        """
+        image_inputs = {}
+        videos_inputs = {}
+
+        if images is not None:
+            image_inputs = self.image_processor(images=images)
+            image_grid_thw = image_inputs["image_grid_thw"]
+
+        if text is None:
+            text = [""]
+        elif not isinstance(text, list):
+            text = [text]
+
+        text = [t for t in text]  # Copy to avoid modifying original
+
+        # Track cumulative image token positions
+        image_tokens_cumsum = [0]
+
+        if images is not None:
+            index = 0
+            for i in range(len(text)):
+                while self.image_token in text[i]:
+                    grid_h, grid_w = image_grid_thw[index][-2:]
+                    patch_h = grid_h // self.image_processor.merge_size
+                    patch_w = grid_w // self.image_processor.merge_size
+                    num_image_tokens = patch_h * (patch_w + 1) + 2
+                    image_tokens_cumsum.append(
+                        image_tokens_cumsum[-1] + num_image_tokens
+                    )
+                    text[i] = text[i].replace(
+                        self.image_token,
+                        self.placeholder_token * num_image_tokens,
+                        1,
+                    )
+                    index += 1
+                text[i] = text[i].replace(self.placeholder_token, self.image_token)
+
+        # Pop return_tensors to handle it ourselves at the end
+        return_tensors = kwargs.pop("return_tensors", None)
+
+        # Tokenize text
+        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
+
+        # Get input_ids and convert to numpy array for processing
+        input_ids = text_inputs["input_ids"]
+        if hasattr(input_ids, "tolist"):
+            # Handle mlx arrays or torch tensors
+            input_ids = np.array(input_ids.tolist())
+        elif isinstance(input_ids, list):
+            input_ids = np.array(input_ids)
+
+        text_inputs["input_ids"] = input_ids
+        seq_len = input_ids.shape[-1]
+
+        # Build 4D position_ids for xdrope
+        # Shape: (1, 4, seq_len) where 4 = [base, w, h, t]
+        position_ids = np.arange(seq_len)
+        position_ids_w = np.arange(seq_len)
+        position_ids_h = np.arange(seq_len)
+        position_ids_t = np.arange(seq_len)
+
+        if images is not None:
+            # Find image token positions
+            image_token_pos_indices = np.where(input_ids[0] == self.image_token_id)[0]
+
+            for i in range(len(image_grid_thw)):
+                grid_h, grid_w = image_grid_thw[i][-2:]
+                patch_h = grid_h // self.image_processor.merge_size
+                patch_w = grid_w // self.image_processor.merge_size
+
+                # Start position for this image's tokens (skip begin token)
+                start_pos = image_token_pos_indices[image_tokens_cumsum[i]].item() + 1
+                replace_num = (patch_w + 1) * patch_h
+
+                # Set width positions: 0, 1, 2, ..., patch_w, 0, 1, 2, ..., patch_w, ...
+                position_ids_w[start_pos : start_pos + replace_num] = np.array(
+                    list(range(patch_w + 1)) * patch_h
+                )
+
+                # Set height positions: 0, 0, ..., 0, 1, 1, ..., 1, ...
+                patch_h_list = []
+                for h in range(patch_h):
+                    patch_h_list += [h] * (patch_w + 1)
+                position_ids_h[start_pos : start_pos + replace_num] = np.array(
+                    patch_h_list
+                )
+
+                # Set temporal positions: all 0 for images
+                position_ids_t[start_pos : start_pos + replace_num] = 0
+
+        # Stack position_ids: (1, 4, seq_len)
+        # Order: base, w, h, t
+        position_ids = np.stack(
+            [position_ids, position_ids_w, position_ids_h, position_ids_t]
+        )[np.newaxis, ...]
+
+        text_inputs["position_ids"] = position_ids
+
+        # Build attention mask
+        attention_mask = (input_ids != self.pad_id).astype(np.int64)
+        text_inputs["attention_mask"] = attention_mask
+
+        # Get image positions
+        text_inputs["imgs_pos"] = [self.get_imgs_pos(input_ids[0])]
+
+        return BatchFeature(
+            data={**text_inputs, **image_inputs, **videos_inputs},
+            tensor_type=return_tensors,
+        )
+
+    def batch_decode(self, *args, **kwargs):
+        """Decode token IDs to text."""
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """Decode token IDs to text."""
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def apply_chat_template(self, *args, **kwargs):
+        """Apply chat template using the tokenizer."""
+        return self.tokenizer.apply_chat_template(*args, **kwargs)
+
+    def get_imgs_pos(self, doc_ids):
+        """Get image positions from document token IDs.
+
+        Args:
+            doc_ids: Token IDs array
+
+        Returns:
+            List of [start, end] positions for each image
+        """
+        doc_ids = np.array(doc_ids, dtype=np.int64)
+        img_begin_index = np.where(doc_ids == self.im_start_token_id)[0]
+        img_end_index = np.where(doc_ids == self.im_end_token_id)[0]
+        imgs_pos = np.concatenate(
+            (
+                np.reshape(img_begin_index + 1, (-1, 1)),
+                np.reshape(img_end_index, (-1, 1)),
+            ),
+            axis=-1,
+        ).tolist()
+        return imgs_pos

+    @property
+    def model_input_names(self):
+        """Return combined input names from tokenizer and image processor."""
+        tokenizer_input_names = (
+            self.tokenizer.model_input_names if self.tokenizer else []
+        )
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """Load processor from pretrained model path."""
+        trust_remote_code = kwargs.pop("trust_remote_code", True)
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+        image_processor = HunYuanVLImageProcessor(**kwargs)
+        return cls(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
+
+
+def split_image_into_patch_blocks(
+    pixel_values: np.ndarray,  # shape: [batch_size, 3, H, W]
+    patch_size: int = 16,
+    adaptor_patch_div: int = 4,
+) -> np.ndarray:
+    """Split the input image array into large patches and then smaller regions.
+
+    Split the input image tensor (supporting batch) into large patches of size `patch_size`,
+    and then further divide each large patch into smaller regions of size
+    (patch_size // adaptor_patch_div) x (patch_size // adaptor_patch_div).
+    Each small region is extracted as a tensor of shape [3, patch_size, patch_size].
+    The final output contains all such small region tensors.
+
+    Args:
+        pixel_values: Input image array of shape [batch_size, 3, H, W].
+        patch_size: Size of the large patch, e.g., 16.
+        adaptor_patch_div: Each large patch is divided into
+            (patch_size // adaptor_patch_div) x (patch_size // adaptor_patch_div)
+            smaller regions.
+
+    Returns:
+        patches: An array of shape [N, 3, patch_size, patch_size],
+            where N = batch_size * (H // patch_size) * (W // patch_size) * (patch_size // adaptor_patch_div)^2.
+            Each element in the batch corresponds to one small image region.
+    """
+    batch_size, channels, height, width = pixel_values.shape
+    assert channels == 3, "Pixel values must have 3 channels in dim=1"
+    assert (
+        height % patch_size == 0 and width % patch_size == 0
+    ), "H and W must be divisible by patch_size"
+
+    patch_height_num = height // patch_size
+    patch_width_num = width // patch_size
+
+    # Reshape to [B, 3, ph, ps, pw, ps]
+    img = pixel_values.reshape(
+        batch_size,
+        3,
+        patch_height_num,
+        patch_size,
+        patch_width_num,
+        patch_size,
+    )
+
+    # Further split each psxps patch into (ps//aps)x(ps//aps) small regions
+    img = img.reshape(
+        batch_size,
+        3,
+        patch_height_num,
+        patch_size // adaptor_patch_div,
+        adaptor_patch_div,
+        patch_width_num,
+        patch_size // adaptor_patch_div,
+        adaptor_patch_div,
+    )
+
+    # Permute to group the small regions: [B, ph, pw, ps//aps, ps//aps, 3, aps, aps]
+    img = img.transpose(0, 2, 5, 3, 6, 1, 4, 7)
+
+    # Reshape into [B * ph * pw * (ps//aps)^2, 3, patch_size, patch_size]
+    patches = img.reshape(-1, 3, patch_size, patch_size)
+
+    return patches
+
+
+# Alias for compatibility
+ImageProcessor = HunYuanVLImageProcessor
+
+
+__all__ = [
+    "HunYuanVLImageProcessor",
+    "HunYuanVLProcessor",
+    "ImageProcessor",
+    "smart_resize",
+    "split_image_into_patch_blocks",
+]