fount_vlm_nell_02-0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/models/phi3_v/processing_phi3_v.py
@@ -0,0 +1,704 @@
+ """
+ MLX-based Phi3V Processor.
+
+ This module provides an MLX-native processor for Phi-3.5-Vision models that:
+ 1. Uses HuggingFace tokenizer (no custom dependencies)
+ 2. Provides an MLX-based image processor (no torch/torchvision dependency)
+ 3. Handles dynamic resolution with HD image processing
+ """
+
+ import json
+ import math
+ import re
+ import warnings
+ from pathlib import Path
+ from typing import List, Optional, Tuple, Union
+
+ import mlx.core as mx
+ import numpy as np
+ import transformers.processing_utils as processing_utils
+ from PIL import Image
+ from transformers import AutoTokenizer
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_processing_utils import BaseImageProcessor
+ from transformers.image_utils import ImageInput, make_list_of_images, valid_images
+ from transformers.processing_utils import ProcessorMixin
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ from transformers.utils import TensorType
+
+
+ def _validate_images_text_input_order(images, text):
+     """
+     Validate and potentially swap the order of images and text arguments.
+     """
+     if images is not None and text is not None:
+         images_is_text = isinstance(images, str) or (
+             isinstance(images, (list, tuple))
+             and len(images) > 0
+             and isinstance(images[0], str)
+         )
+         text_is_image = not isinstance(text, str) and not (
+             isinstance(text, (list, tuple))
+             and len(text) > 0
+             and isinstance(text[0], str)
+         )
+
+         if images_is_text and text_is_image:
+             warnings.warn(
+                 "You passed text as the first argument and images as the second. "
+                 "This is deprecated and will be removed in a future version. "
+                 "Please pass images first and text second.",
+                 FutureWarning,
+             )
+             return text, images
+
+     return images, text
+
+
+ # Add the function to transformers.processing_utils if it doesn't exist
+ if not hasattr(processing_utils, "_validate_images_text_input_order"):
+     processing_utils._validate_images_text_input_order = (
+         _validate_images_text_input_order
+     )
+
+ # Also add Unpack if it doesn't exist (for older Python versions)
+ if not hasattr(processing_utils, "Unpack"):
+     try:
+         from typing import Unpack
+
+         processing_utils.Unpack = Unpack
+     except ImportError:
+         from typing_extensions import Unpack
+
+         processing_utils.Unpack = Unpack
+
+
+ # CLIP-style normalization constants (same as OpenAI CLIP)
+ OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+ def _calc_padded_size(width: int, height: int, padding_unit: int = 336):
+     """Calculate the padded size to be divisible by padding_unit."""
+     target_height = math.ceil(height / padding_unit) * padding_unit
+     target_width = math.ceil(width / padding_unit) * padding_unit
+     return target_width, target_height
+
+
+ def _calc_hd_transform_size(width: int, height: int, hd_num: int = 16):
+     """
+     Calculate the HD transform size for dynamic resolution.
+     Phi-3.5 uses a 336x336 base size and supports up to hd_num tiles.
+     """
+     transposed = False
+     if width < height:
+         width, height = height, width
+         transposed = True
+
+     ratio = width / height
+     scale = 1
+     while scale * math.ceil(scale / ratio) <= hd_num:
+         scale += 1
+     scale -= 1
+
+     new_width = int(scale * 336)
+     new_height = int(new_width / ratio)
+
+     # Make dimensions divisible by 336
+     padded_width, padded_height = _calc_padded_size(new_width, new_height, 336)
+
+     if transposed:
+         padded_width, padded_height = padded_height, padded_width
+
+     return padded_width, padded_height
+
+
+ def _hd_transform(img: Image.Image, hd_num: int = 16) -> Image.Image:
+     """
+     Apply HD transform to resize image for dynamic resolution.
+     """
+     width, height = img.size
+     target_width, target_height = _calc_hd_transform_size(width, height, hd_num)
+     return img.resize((target_width, target_height), Image.Resampling.BICUBIC)
+
+
+ def _pad_to_336(img: Image.Image) -> Image.Image:
+     """
+     Pad image dimensions to be divisible by 336.
+     """
+     width, height = img.size
+     target_width = math.ceil(width / 336) * 336
+     target_height = math.ceil(height / 336) * 336
+
+     if target_width == width and target_height == height:
+         return img
+
+     # Create new image with black background
+     new_img = Image.new("RGB", (target_width, target_height), (0, 0, 0))
+     new_img.paste(img, (0, 0))
+     return new_img
+
+
+ class Phi3VImageProcessor(BaseImageProcessor):
+     """
+     Image processor for Phi-3.5-Vision models.
+
+     Processes images using HD dynamic resolution with 336x336 tiles,
+     similar to the official Phi-3.5-Vision implementation.
+     """
+
+     model_input_names = ["pixel_values", "image_sizes"]
+
+     def __init__(
+         self,
+         image_mean: Tuple[float, float, float] = OPENAI_CLIP_MEAN,
+         image_std: Tuple[float, float, float] = OPENAI_CLIP_STD,
+         num_crops: int = 4,
+         num_img_tokens: int = 144,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.image_mean = image_mean
+         self.image_std = image_std
+         self.num_crops = num_crops
+         self.num_img_tokens = num_img_tokens
+         self.img_size = 336
+
+     def calc_num_image_tokens(self, image: Image.Image) -> int:
+         """
+         Calculate the number of image tokens for a given image.
+         """
+         width, height = image.size
+         hd_width, hd_height = _calc_hd_transform_size(width, height, self.num_crops)
+         num_h_tiles = hd_height // self.img_size
+         num_w_tiles = hd_width // self.img_size
+         # Global image tokens + sub-image tokens + separators
+         num_tokens = (
+             (num_h_tiles * num_w_tiles + 1) * self.num_img_tokens
+             + 1
+             + (num_h_tiles + 1) * 12
+         )
+         return num_tokens
+
+     def _process_single_image(
+         self, image: Image.Image
+     ) -> Tuple[np.ndarray, Tuple[int, int]]:
+         """
+         Process a single image with HD transform and normalize.
+
+         Returns:
+             pixel_values: numpy array of shape (num_tiles + 1, C, H, W)
+             image_size: (height, width) of the HD transformed image
+         """
+         # Ensure RGB
+         if image.mode != "RGB":
+             image = image.convert("RGB")
+
+         # Apply HD transform
+         hd_image = _hd_transform(image, self.num_crops)
+         hd_image = _pad_to_336(hd_image)
+         hd_width, hd_height = hd_image.size
+
+         # Create global image (resized to 336x336)
+         global_image = hd_image.resize(
+             (self.img_size, self.img_size), Image.Resampling.BICUBIC
+         )
+
+         # Split HD image into 336x336 tiles
+         num_h_tiles = hd_height // self.img_size
+         num_w_tiles = hd_width // self.img_size
+
+         tiles = []
+         for h in range(num_h_tiles):
+             for w in range(num_w_tiles):
+                 left = w * self.img_size
+                 top = h * self.img_size
+                 right = left + self.img_size
+                 bottom = top + self.img_size
+                 tile = hd_image.crop((left, top, right, bottom))
+                 tiles.append(tile)
+
+         # Global image first, then tiles
+         all_images = [global_image] + tiles
+
+         # Convert to numpy arrays and normalize
+         processed = []
+         for img in all_images:
+             arr = np.array(img, dtype=np.float32) / 255.0
+             # Normalize
+             arr = (arr - np.array(self.image_mean)) / np.array(self.image_std)
+             # HWC to CHW
+             arr = arr.transpose(2, 0, 1)
+             processed.append(arr)
+
+         pixel_values = np.stack(processed, axis=0)  # (num_tiles + 1, C, H, W)
+         image_size = (hd_height, hd_width)
+
+         return pixel_values, image_size
+
+     def preprocess(
+         self,
+         images: ImageInput,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+         **kwargs,
+     ) -> BatchFeature:
+         """Process images and return BatchFeature."""
+         images = make_list_of_images(images)
+
+         if not valid_images(images):
+             raise ValueError(
+                 "Invalid image type. Must be of type PIL.Image.Image or similar."
+             )
+
+         all_pixel_values = []
+         all_image_sizes = []
+
+         for image in images:
+             # Convert to PIL if needed
+             if isinstance(image, np.ndarray):
+                 image = Image.fromarray(image)
+
+             pixel_values, image_size = self._process_single_image(image)
+             all_pixel_values.append(pixel_values)
+             all_image_sizes.append(image_size)
+
+         # Stack with padding to handle variable number of tiles
+         max_tiles = max(pv.shape[0] for pv in all_pixel_values)
+         batch_size = len(all_pixel_values)
+
+         # Pad to same number of tiles
+         padded_pixel_values = []
+         for pv in all_pixel_values:
+             if pv.shape[0] < max_tiles:
+                 padding = np.zeros(
+                     (max_tiles - pv.shape[0], *pv.shape[1:]), dtype=pv.dtype
+                 )
+                 pv = np.concatenate([pv, padding], axis=0)
+             padded_pixel_values.append(pv)
+
+         pixel_values = np.stack(padded_pixel_values, axis=0)  # (B, T, C, H, W)
+         image_sizes = np.array(all_image_sizes)  # (B, 2)
+
+         data = {
+             "pixel_values": mx.array(pixel_values),
+             "image_sizes": mx.array(image_sizes),
+         }
+
+         return BatchFeature(data=data, tensor_type=return_tensors)
+
+     def __call__(
+         self,
+         images: ImageInput,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+         **kwargs,
+     ) -> BatchFeature:
+         """Make the image processor callable."""
+         return self.preprocess(images, return_tensors=return_tensors, **kwargs)
+
+
+ class Phi3VProcessor(ProcessorMixin):
+     """
+     MLX-based processor for Phi-3.5-Vision that doesn't require torch/torchvision.
+
+     Constructs a Phi3V processor which wraps a Phi3V image processor and a tokenizer
+     into a single processor.
+     """
+
+     attributes = ["image_processor", "tokenizer"]
+     valid_kwargs = ["chat_template"]
+     image_processor_class = "Phi3VImageProcessor"
+     tokenizer_class = "AutoTokenizer"
+
+     def __init__(
+         self,
+         image_processor=None,
+         tokenizer=None,
+         chat_template=None,
+         **kwargs,
+     ):
+         if image_processor is None:
+             image_processor = Phi3VImageProcessor()
+         super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+     def _convert_images_texts_to_inputs(
+         self,
+         images: List[Image.Image],
+         texts: str,
+         padding: bool = False,
+         truncation: bool = None,
+         max_length: int = None,
+     ) -> BatchFeature:
+         """
+         Convert images and text to model inputs, replacing image tokens with negative IDs.
+
+         The Phi3V model expects image tokens to be represented as negative values in input_ids.
+         For example, <|image_1|> becomes a sequence of -1 values, <|image_2|> becomes -2 values.
+         """
+         # Pattern to match image tokens like <|image_1|>, <|image_2|>, etc.
+         pattern = r"<\|image_\d+\|>"
+
+         # Process images first to get their sizes and calculate token counts
+         if images:
+             images = make_list_of_images(images)
+             pil_images = []
+             for img in images:
+                 if isinstance(img, np.ndarray):
+                     img = Image.fromarray(img)
+                 if img.mode != "RGB":
+                     img = img.convert("RGB")
+                 pil_images.append(img)
+
+             # Calculate number of tokens for each image
+             num_img_tokens = [
+                 self.image_processor.calc_num_image_tokens(img) for img in pil_images
+             ]
+
+             # Process images through image processor
+             image_inputs = self.image_processor(pil_images)
+         else:
+             pil_images = []
+             num_img_tokens = []
+             image_inputs = {}
+
+         # Find image tags and extract their IDs
+         image_tags = re.findall(pattern, texts)
+
+         if image_tags:
+             # Extract image IDs from tags (e.g., <|image_1|> -> 1)
+             image_ids = [int(tag.split("|")[1].split("_")[-1]) for tag in image_tags]
+
+             # Validate: unique image IDs should be sequential starting from 1
+             unique_ids = sorted(set(image_ids))
+             if unique_ids != list(range(1, len(unique_ids) + 1)):
+                 raise ValueError(
+                     f"Image IDs must be sequential starting from 1. Got: {unique_ids}"
+                 )
+
+             # Validate: number of unique image IDs should match number of images
+             if len(unique_ids) != len(pil_images):
+                 raise ValueError(
+                     f"Number of image tags ({len(unique_ids)}) doesn't match "
+                     f"number of images ({len(pil_images)})"
+                 )
+
+             # Create padded negative IDs for each image tag
+             # Each <|image_N|> is replaced with num_img_tokens[N-1] copies of -N
+             image_ids_pad = [[-iid] * num_img_tokens[iid - 1] for iid in image_ids]
+
+             # Split text by image pattern and tokenize each chunk
+             text_chunks = texts.split("<|image_")
+
+             # Reconstruct the split to handle the pattern properly
+             prompt_chunks = []
+             for i, chunk in enumerate(re.split(pattern, texts)):
+                 tokens = self.tokenizer.encode(chunk, add_special_tokens=(i == 0))
+                 prompt_chunks.append(tokens)
+
+             # Interleave text chunks with image token sequences
+             input_ids = []
+             img_idx = 0
+             for i, chunk in enumerate(prompt_chunks):
+                 # Add text tokens (skip BOS if not first chunk)
+                 offset = 0 if i == 0 else 1  # Skip BOS token for subsequent chunks
+                 if i > 0 and len(chunk) > 0 and chunk[0] == self.tokenizer.bos_token_id:
+                     offset = 1
+                 input_ids.extend(chunk[offset:])
+
+                 # Add image tokens if there's a corresponding image
+                 if img_idx < len(image_ids_pad):
+                     input_ids.extend(image_ids_pad[img_idx])
+                     img_idx += 1
+         else:
+             # No image tokens, just tokenize normally
+             input_ids = self.tokenizer.encode(texts)
+
+         # Create attention mask (all tokens including negative IDs are attended to)
+         attention_mask = [1] * len(input_ids)
+
+         text_inputs = {
+             "input_ids": mx.array([input_ids]),
+             "attention_mask": mx.array([attention_mask]),
+         }
+
+         return BatchFeature(data={**text_inputs, **image_inputs})
+
+     def __call__(
+         self,
+         images: ImageInput = None,
+         text: Union[
+             TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
+         ] = None,
+         **kwargs,
+     ) -> BatchFeature:
+         """
+         Main method to prepare for the model one or several sequence(s) and image(s).
+
+         Args:
+             images: The image or batch of images to be prepared.
+             text: The sequence or batch of sequences to be encoded.
+             return_tensors: If set, will return tensors of a particular framework.
+
+         Returns:
+             BatchFeature with input_ids, attention_mask, pixel_values, and image_sizes.
+         """
+         if images is None and text is None:
+             raise ValueError("You have to specify at least one of `images` or `text`.")
+
+         # Check if images and text inputs are reversed for BC
+         images, text = _validate_images_text_input_order(images, text)
+
+         # Extract return_tensors from kwargs (unused, we always return MLX arrays)
+         kwargs.pop("return_tensors", None)
+         padding = kwargs.pop("padding", False)
+         truncation = kwargs.pop("truncation", None)
+         max_length = kwargs.pop("max_length", None)
+
+         # Convert to list if single text
+         if isinstance(text, str):
+             texts = [text]
+         elif text is not None:
+             texts = list(text)
+         else:
+             texts = None
+
+         # Convert images to list if needed
+         if images is not None:
+             if not isinstance(images, list):
+                 images = [images]
+         else:
+             images = []
+
+         # Process images and text together (handles image token replacement)
+         if texts is not None:
+             # For now, handle single text input (batching can be added later)
+             if len(texts) == 1:
+                 return self._convert_images_texts_to_inputs(
+                     images=images,
+                     texts=texts[0],
+                     padding=padding,
+                     truncation=truncation,
+                     max_length=max_length,
+                 )
+             else:
+                 # Batch processing: process each text separately and combine
+                 all_input_ids = []
+                 all_attention_masks = []
+                 all_pixel_values = []
+                 all_image_sizes = []
+
+                 for txt in texts:
+                     result = self._convert_images_texts_to_inputs(
+                         images=images,
+                         texts=txt,
+                         padding=padding,
+                         truncation=truncation,
+                         max_length=max_length,
+                     )
+                     all_input_ids.append(result["input_ids"][0].tolist())
+                     all_attention_masks.append(result["attention_mask"][0].tolist())
+                     if "pixel_values" in result:
+                         all_pixel_values.append(result["pixel_values"])
+                     if "image_sizes" in result:
+                         all_image_sizes.append(result["image_sizes"])
+
+                 # Pad input_ids and attention_masks to same length
+                 max_len = max(len(ids) for ids in all_input_ids)
+                 pad_token_id = self.tokenizer.pad_token_id or 0
+
+                 padded_input_ids = []
+                 padded_attention_masks = []
+                 for ids, mask in zip(all_input_ids, all_attention_masks):
+                     padding_length = max_len - len(ids)
+                     padded_input_ids.append(ids + [pad_token_id] * padding_length)
+                     padded_attention_masks.append(mask + [0] * padding_length)
+
+                 data = {
+                     "input_ids": mx.array(padded_input_ids),
+                     "attention_mask": mx.array(padded_attention_masks),
+                 }
+
+                 if all_pixel_values:
+                     data["pixel_values"] = all_pixel_values[
+                         0
+                     ]  # Same images for all texts
+                 if all_image_sizes:
+                     data["image_sizes"] = all_image_sizes[0]
+
+                 return BatchFeature(data=data)
+
+         # Text-only case
+         if images:
+             image_inputs = self.image_processor(images)
+         else:
+             image_inputs = {}
+
+         return BatchFeature(data=image_inputs)
+
+     def batch_decode(self, *args, **kwargs):
+         """Forward to tokenizer's batch_decode."""
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     def decode(self, *args, **kwargs):
+         """Forward to tokenizer's decode."""
+         return self.tokenizer.decode(*args, **kwargs)
+
+     def apply_chat_template(
+         self,
+         conversation,
+         chat_template=None,
+         add_generation_prompt=False,
+         tokenize=False,
+         **kwargs,
+     ):
+         """Apply chat template to the conversation."""
+         if chat_template is None:
+             chat_template = self.chat_template
+         if chat_template is None:
+             chat_template = getattr(self.tokenizer, "chat_template", None)
+
+         if chat_template is None:
+             raise ValueError(
+                 "No chat template found. Please provide a chat_template argument "
+                 "or ensure the tokenizer has a chat_template attribute."
+             )
+
+         try:
+             from jinja2 import Template
+         except ImportError:
+             raise ImportError("jinja2 is required for apply_chat_template")
+
+         template = Template(chat_template)
+         rendered = template.render(
+             messages=conversation,
+             add_generation_prompt=add_generation_prompt,
+             **kwargs,
+         )
+
+         if tokenize:
+             return self.tokenizer.encode(rendered)
+         return rendered
+
+     @property
+     def model_input_names(self):
+         """Get the model input names from tokenizer and image processor."""
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+         """Load the processor from a pretrained model path."""
+         from huggingface_hub import hf_hub_download
+
+         kwargs.pop("trust_remote_code", None)
+
+         model_path = Path(pretrained_model_name_or_path)
+         is_local = model_path.exists() and model_path.is_dir()
+
+         tokenizer = AutoTokenizer.from_pretrained(
+             str(model_path) if is_local else pretrained_model_name_or_path,
+             trust_remote_code=True,
+             local_files_only=is_local,
+         )
+
+         # Load image processor config
+         image_processor_config = {}
+         try:
+             if is_local:
+                 config_path = model_path / "preprocessor_config.json"
+             else:
+                 config_path = Path(
+                     hf_hub_download(
+                         pretrained_model_name_or_path, "preprocessor_config.json"
+                     )
+                 )
+             if config_path.exists():
+                 with open(config_path, "r", encoding="utf-8") as f:
+                     preprocessor_config = json.load(f)
+                 if "num_crops" in preprocessor_config:
+                     image_processor_config["num_crops"] = preprocessor_config[
+                         "num_crops"
+                     ]
+                 if "num_img_tokens" in preprocessor_config:
+                     image_processor_config["num_img_tokens"] = preprocessor_config[
+                         "num_img_tokens"
+                     ]
+                 if "image_mean" in preprocessor_config:
+                     image_processor_config["image_mean"] = tuple(
+                         preprocessor_config["image_mean"]
+                     )
+                 if "image_std" in preprocessor_config:
+                     image_processor_config["image_std"] = tuple(
+                         preprocessor_config["image_std"]
+                     )
+         except Exception:
+             pass
+
+         image_processor = Phi3VImageProcessor(**image_processor_config)
+
+         # Load chat template from jinja file if not already set on tokenizer
+         chat_template = getattr(tokenizer, "chat_template", None)
+         if chat_template is None:
+             try:
+                 if is_local:
+                     jinja_path = model_path / "chat_template.jinja"
+                 else:
+                     jinja_path = Path(
+                         hf_hub_download(
+                             pretrained_model_name_or_path, "chat_template.jinja"
+                         )
+                     )
+                 if jinja_path.exists():
+                     chat_template = jinja_path.read_text(encoding="utf-8")
+                     tokenizer.chat_template = chat_template
+             except Exception:
+                 pass
+
+         return cls(
+             image_processor=image_processor,
+             tokenizer=tokenizer,
+             chat_template=chat_template,
+         )
+
+
+ # Register the processor with AutoProcessor
+ from transformers import AutoProcessor
+
+ _original_auto_processor_from_pretrained = AutoProcessor.from_pretrained
+
+
+ @classmethod
+ def _patched_auto_processor_from_pretrained(
+     cls, pretrained_model_name_or_path, **kwargs
+ ):
+     """Patched from_pretrained that returns Phi3VProcessor for phi3_v models."""
+     from huggingface_hub import hf_hub_download
+
+     model_path = Path(pretrained_model_name_or_path)
+     is_local = model_path.exists() and model_path.is_dir()
+
+     # Check if this is a phi3_v model
+     is_phi3_v = False
+     try:
+         if is_local:
+             config_path = model_path / "config.json"
+         else:
+             config_path = Path(
+                 hf_hub_download(pretrained_model_name_or_path, "config.json")
+             )
+         with open(config_path, "r", encoding="utf-8") as f:
+             config = json.load(f)
+         model_type = config.get("model_type", "").lower()
+         is_phi3_v = model_type in ("phi3_v", "phi3-v", "phi3v")
+     except Exception:
+         pass
+
+     if is_phi3_v:
+         return Phi3VProcessor.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+     return _original_auto_processor_from_pretrained.__func__(
+         cls, pretrained_model_name_or_path, **kwargs
+     )
+
+
+ AutoProcessor.from_pretrained = _patched_auto_processor_from_pretrained
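
For orientation only (not part of the published diff): the file above patches transformers.AutoProcessor so that checkpoints whose config.json declares model_type "phi3_v" are handed to Phi3VProcessor. By my reading of _calc_hd_transform_size and calc_num_image_tokens, an 800x600 input with the default num_crops=4 is resized to 672x672, i.e. a 2x2 tile grid plus the global 336x336 view, and each <|image_N|> placeholder in the prompt is then expanded into the corresponding number of negative token IDs (-N). The sketch below is an assumption layered on the code shown; the checkpoint id and prompt format are placeholders, not something this diff guarantees.

# Illustrative usage sketch -- assumptions, not package content.
from PIL import Image

from mlx_vlm.models.phi3_v.processing_phi3_v import Phi3VProcessor

# Placeholder checkpoint id; any local directory or Hub repo whose config.json
# declares model_type "phi3_v" should be handled the same way.
processor = Phi3VProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct")

image = Image.new("RGB", (800, 600), (128, 128, 128))  # stand-in for a real photo
prompt = "<|user|>\n<|image_1|>\nDescribe the image.<|end|>\n<|assistant|>\n"

inputs = processor(images=[image], text=prompt)
# input_ids holds -1 wherever <|image_1|> was expanded; pixel_values is
# (batch, num_tiles + 1, 3, 336, 336) and image_sizes is (batch, 2).
print(inputs["input_ids"].shape, inputs["pixel_values"].shape, inputs["image_sizes"])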