fount_vlm_nell_02-0.3.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
- fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
- fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
- fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
- fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
- fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
- mlx_vlm/__init__.py +16 -0
- mlx_vlm/__main__.py +24 -0
- mlx_vlm/chat.py +234 -0
- mlx_vlm/chat_ui.py +508 -0
- mlx_vlm/convert.py +284 -0
- mlx_vlm/deprecation.py +52 -0
- mlx_vlm/evals/__init__.py +0 -0
- mlx_vlm/evals/math_vista.py +565 -0
- mlx_vlm/evals/mmmu.py +528 -0
- mlx_vlm/evals/mmstar.py +343 -0
- mlx_vlm/evals/ocrbench.py +453 -0
- mlx_vlm/evals/utils.py +37 -0
- mlx_vlm/generate.py +1457 -0
- mlx_vlm/lora.py +207 -0
- mlx_vlm/models/__init__.py +0 -0
- mlx_vlm/models/aya_vision/__init__.py +2 -0
- mlx_vlm/models/aya_vision/aya_vision.py +188 -0
- mlx_vlm/models/aya_vision/config.py +52 -0
- mlx_vlm/models/aya_vision/language.py +202 -0
- mlx_vlm/models/aya_vision/vision.py +340 -0
- mlx_vlm/models/base.py +356 -0
- mlx_vlm/models/cache.py +238 -0
- mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
- mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
- mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
- mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
- mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
- mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
- mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
- mlx_vlm/models/deepseekocr/__init__.py +2 -0
- mlx_vlm/models/deepseekocr/config.py +173 -0
- mlx_vlm/models/deepseekocr/conversation.py +264 -0
- mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
- mlx_vlm/models/deepseekocr/language.py +547 -0
- mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
- mlx_vlm/models/deepseekocr/sam.py +489 -0
- mlx_vlm/models/deepseekocr/vision.py +263 -0
- mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
- mlx_vlm/models/deepseekocr_2/config.py +216 -0
- mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
- mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
- mlx_vlm/models/deepseekocr_2/vision.py +439 -0
- mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
- mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
- mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
- mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
- mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
- mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
- mlx_vlm/models/fastvlm/__init__.py +2 -0
- mlx_vlm/models/fastvlm/config.py +79 -0
- mlx_vlm/models/fastvlm/fastvlm.py +198 -0
- mlx_vlm/models/fastvlm/language.py +49 -0
- mlx_vlm/models/fastvlm/vision.py +692 -0
- mlx_vlm/models/florence2/__init__.py +2 -0
- mlx_vlm/models/florence2/config.py +84 -0
- mlx_vlm/models/florence2/florence2.py +383 -0
- mlx_vlm/models/florence2/language.py +452 -0
- mlx_vlm/models/florence2/processing_florence2.py +30 -0
- mlx_vlm/models/florence2/vision.py +552 -0
- mlx_vlm/models/gemma3/__init__.py +2 -0
- mlx_vlm/models/gemma3/config.py +52 -0
- mlx_vlm/models/gemma3/gemma3.py +194 -0
- mlx_vlm/models/gemma3/language.py +293 -0
- mlx_vlm/models/gemma3/vision.py +215 -0
- mlx_vlm/models/gemma3n/__init__.py +2 -0
- mlx_vlm/models/gemma3n/audio.py +1038 -0
- mlx_vlm/models/gemma3n/config.py +130 -0
- mlx_vlm/models/gemma3n/gemma3n.py +322 -0
- mlx_vlm/models/gemma3n/language.py +631 -0
- mlx_vlm/models/gemma3n/vision.py +994 -0
- mlx_vlm/models/glm4v/__init__.py +3 -0
- mlx_vlm/models/glm4v/config.py +79 -0
- mlx_vlm/models/glm4v/glm4v.py +188 -0
- mlx_vlm/models/glm4v/language.py +574 -0
- mlx_vlm/models/glm4v/processing.py +220 -0
- mlx_vlm/models/glm4v/vision.py +406 -0
- mlx_vlm/models/glm4v_moe/__init__.py +3 -0
- mlx_vlm/models/glm4v_moe/config.py +81 -0
- mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
- mlx_vlm/models/glm4v_moe/language.py +674 -0
- mlx_vlm/models/glm4v_moe/processing.py +229 -0
- mlx_vlm/models/glm4v_moe/vision.py +405 -0
- mlx_vlm/models/glm_ocr/__init__.py +3 -0
- mlx_vlm/models/glm_ocr/config.py +93 -0
- mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
- mlx_vlm/models/glm_ocr/language.py +585 -0
- mlx_vlm/models/glm_ocr/processing.py +208 -0
- mlx_vlm/models/glm_ocr/vision.py +342 -0
- mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
- mlx_vlm/models/hunyuan_vl/config.py +136 -0
- mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
- mlx_vlm/models/hunyuan_vl/language.py +509 -0
- mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
- mlx_vlm/models/hunyuan_vl/vision.py +322 -0
- mlx_vlm/models/idefics2/__init__.py +2 -0
- mlx_vlm/models/idefics2/config.py +65 -0
- mlx_vlm/models/idefics2/idefics2.py +321 -0
- mlx_vlm/models/idefics2/language.py +161 -0
- mlx_vlm/models/idefics2/vision.py +244 -0
- mlx_vlm/models/idefics3/__init__.py +4 -0
- mlx_vlm/models/idefics3/config.py +54 -0
- mlx_vlm/models/idefics3/idefics3.py +221 -0
- mlx_vlm/models/idefics3/language.py +157 -0
- mlx_vlm/models/idefics3/vision.py +265 -0
- mlx_vlm/models/internvl_chat/__init__.py +3 -0
- mlx_vlm/models/internvl_chat/config.py +89 -0
- mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
- mlx_vlm/models/internvl_chat/language.py +187 -0
- mlx_vlm/models/internvl_chat/processor.py +395 -0
- mlx_vlm/models/internvl_chat/vision.py +265 -0
- mlx_vlm/models/interpolate.py +183 -0
- mlx_vlm/models/jina_vlm/__init__.py +3 -0
- mlx_vlm/models/jina_vlm/config.py +142 -0
- mlx_vlm/models/jina_vlm/image_processor.py +430 -0
- mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
- mlx_vlm/models/jina_vlm/language.py +272 -0
- mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
- mlx_vlm/models/jina_vlm/vision.py +202 -0
- mlx_vlm/models/kernels.py +447 -0
- mlx_vlm/models/kimi_vl/__init__.py +4 -0
- mlx_vlm/models/kimi_vl/config.py +84 -0
- mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
- mlx_vlm/models/kimi_vl/language.py +460 -0
- mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
- mlx_vlm/models/kimi_vl/vision.py +485 -0
- mlx_vlm/models/lfm2_vl/__init__.py +2 -0
- mlx_vlm/models/lfm2_vl/config.py +94 -0
- mlx_vlm/models/lfm2_vl/language.py +49 -0
- mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
- mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
- mlx_vlm/models/lfm2_vl/vision.py +223 -0
- mlx_vlm/models/llama4/__init__.py +2 -0
- mlx_vlm/models/llama4/config.py +83 -0
- mlx_vlm/models/llama4/language.py +334 -0
- mlx_vlm/models/llama4/llama4.py +146 -0
- mlx_vlm/models/llama4/vision.py +526 -0
- mlx_vlm/models/llava/__init__.py +2 -0
- mlx_vlm/models/llava/config.py +61 -0
- mlx_vlm/models/llava/language.py +200 -0
- mlx_vlm/models/llava/llava.py +132 -0
- mlx_vlm/models/llava/vision.py +233 -0
- mlx_vlm/models/llava_bunny/__init__.py +2 -0
- mlx_vlm/models/llava_bunny/config.py +85 -0
- mlx_vlm/models/llava_bunny/language.py +194 -0
- mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
- mlx_vlm/models/llava_bunny/vision.py +278 -0
- mlx_vlm/models/llava_next/__init__.py +2 -0
- mlx_vlm/models/llava_next/config.py +60 -0
- mlx_vlm/models/llava_next/language.py +192 -0
- mlx_vlm/models/llava_next/llava_next.py +138 -0
- mlx_vlm/models/llava_next/vision.py +217 -0
- mlx_vlm/models/mistral3/__init__.py +2 -0
- mlx_vlm/models/mistral3/config.py +59 -0
- mlx_vlm/models/mistral3/language.py +269 -0
- mlx_vlm/models/mistral3/mistral3.py +383 -0
- mlx_vlm/models/mllama/__init__.py +4 -0
- mlx_vlm/models/mllama/config.py +74 -0
- mlx_vlm/models/mllama/language.py +377 -0
- mlx_vlm/models/mllama/mllama.py +210 -0
- mlx_vlm/models/mllama/vision.py +458 -0
- mlx_vlm/models/molmo/__init__.py +5 -0
- mlx_vlm/models/molmo/config.py +93 -0
- mlx_vlm/models/molmo/language.py +208 -0
- mlx_vlm/models/molmo/molmo.py +108 -0
- mlx_vlm/models/molmo/processing_molmo.py +763 -0
- mlx_vlm/models/molmo/vision.py +408 -0
- mlx_vlm/models/molmo2/__init__.py +6 -0
- mlx_vlm/models/molmo2/config.py +137 -0
- mlx_vlm/models/molmo2/language.py +206 -0
- mlx_vlm/models/molmo2/molmo2.py +330 -0
- mlx_vlm/models/molmo2/processing.py +773 -0
- mlx_vlm/models/molmo2/vision.py +286 -0
- mlx_vlm/models/moondream2/__init__.py +11 -0
- mlx_vlm/models/moondream2/config.py +92 -0
- mlx_vlm/models/moondream2/image_crops.py +269 -0
- mlx_vlm/models/moondream2/language.py +267 -0
- mlx_vlm/models/moondream2/moondream2.py +522 -0
- mlx_vlm/models/moondream2/processing_moondream.py +144 -0
- mlx_vlm/models/moondream2/vision.py +200 -0
- mlx_vlm/models/multi_modality/__init__.py +4 -0
- mlx_vlm/models/multi_modality/config.py +108 -0
- mlx_vlm/models/multi_modality/language.py +191 -0
- mlx_vlm/models/multi_modality/multi_modality.py +338 -0
- mlx_vlm/models/multi_modality/sam.py +543 -0
- mlx_vlm/models/multi_modality/vision.py +450 -0
- mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
- mlx_vlm/models/paddleocr_vl/config.py +93 -0
- mlx_vlm/models/paddleocr_vl/language.py +522 -0
- mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
- mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
- mlx_vlm/models/paddleocr_vl/vision.py +358 -0
- mlx_vlm/models/paligemma/__init__.py +4 -0
- mlx_vlm/models/paligemma/config.py +50 -0
- mlx_vlm/models/paligemma/language.py +253 -0
- mlx_vlm/models/paligemma/paligemma.py +140 -0
- mlx_vlm/models/paligemma/vision.py +218 -0
- mlx_vlm/models/phi3_v/__init__.py +5 -0
- mlx_vlm/models/phi3_v/config.py +55 -0
- mlx_vlm/models/phi3_v/language.py +2 -0
- mlx_vlm/models/phi3_v/phi3_v.py +239 -0
- mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
- mlx_vlm/models/phi3_v/vision.py +294 -0
- mlx_vlm/models/pixtral/__init__.py +4 -0
- mlx_vlm/models/pixtral/config.py +69 -0
- mlx_vlm/models/pixtral/language.py +195 -0
- mlx_vlm/models/pixtral/pixtral.py +208 -0
- mlx_vlm/models/pixtral/vision.py +293 -0
- mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
- mlx_vlm/models/qwen2_5_vl/config.py +90 -0
- mlx_vlm/models/qwen2_5_vl/language.py +541 -0
- mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
- mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
- mlx_vlm/models/qwen2_vl/__init__.py +2 -0
- mlx_vlm/models/qwen2_vl/config.py +86 -0
- mlx_vlm/models/qwen2_vl/language.py +539 -0
- mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
- mlx_vlm/models/qwen2_vl/vision.py +308 -0
- mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
- mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
- mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
- mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
- mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
- mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
- mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
- mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
- mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
- mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
- mlx_vlm/models/qwen3_vl/__init__.py +2 -0
- mlx_vlm/models/qwen3_vl/config.py +103 -0
- mlx_vlm/models/qwen3_vl/language.py +596 -0
- mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
- mlx_vlm/models/qwen3_vl/vision.py +441 -0
- mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
- mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
- mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
- mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
- mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
- mlx_vlm/models/smolvlm/__init__.py +4 -0
- mlx_vlm/models/smolvlm/config.py +59 -0
- mlx_vlm/models/smolvlm/smolvlm.py +60 -0
- mlx_vlm/prompt_utils.py +565 -0
- mlx_vlm/sample_utils.py +39 -0
- mlx_vlm/server.py +1107 -0
- mlx_vlm/smolvlm_video_generate.py +109 -0
- mlx_vlm/tokenizer_utils.py +371 -0
- mlx_vlm/trainer/__init__.py +9 -0
- mlx_vlm/trainer/lora.py +70 -0
- mlx_vlm/trainer/trainer.py +299 -0
- mlx_vlm/trainer/utils.py +160 -0
- mlx_vlm/utils.py +1339 -0
- mlx_vlm/version.py +1 -0
- mlx_vlm/video_generate.py +611 -0
- mlx_vlm/models/phi3_v/processing_phi3_v.py
@@ -0,0 +1,704 @@
"""
MLX-based Phi3V Processor.

This module provides an MLX-native processor for Phi-3.5-Vision models that:
1. Uses HuggingFace tokenizer (no custom dependencies)
2. Provides an MLX-based image processor (no torch/torchvision dependency)
3. Handles dynamic resolution with HD image processing
"""

import json
import math
import re
import warnings
from pathlib import Path
from typing import List, Optional, Tuple, Union

import mlx.core as mx
import numpy as np
import transformers.processing_utils as processing_utils
from PIL import Image
from transformers import AutoTokenizer
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_processing_utils import BaseImageProcessor
from transformers.image_utils import ImageInput, make_list_of_images, valid_images
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils import TensorType


def _validate_images_text_input_order(images, text):
    """
    Validate and potentially swap the order of images and text arguments.
    """
    if images is not None and text is not None:
        images_is_text = isinstance(images, str) or (
            isinstance(images, (list, tuple))
            and len(images) > 0
            and isinstance(images[0], str)
        )
        text_is_image = not isinstance(text, str) and not (
            isinstance(text, (list, tuple))
            and len(text) > 0
            and isinstance(text[0], str)
        )

        if images_is_text and text_is_image:
            warnings.warn(
                "You passed text as the first argument and images as the second. "
                "This is deprecated and will be removed in a future version. "
                "Please pass images first and text second.",
                FutureWarning,
            )
            return text, images

    return images, text


# Add the function to transformers.processing_utils if it doesn't exist
if not hasattr(processing_utils, "_validate_images_text_input_order"):
    processing_utils._validate_images_text_input_order = (
        _validate_images_text_input_order
    )

# Also add Unpack if it doesn't exist (for older Python versions)
if not hasattr(processing_utils, "Unpack"):
    try:
        from typing import Unpack

        processing_utils.Unpack = Unpack
    except ImportError:
        from typing_extensions import Unpack

        processing_utils.Unpack = Unpack


# CLIP-style normalization constants (same as OpenAI CLIP)
OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)


def _calc_padded_size(width: int, height: int, padding_unit: int = 336):
    """Calculate the padded size to be divisible by padding_unit."""
    target_height = math.ceil(height / padding_unit) * padding_unit
    target_width = math.ceil(width / padding_unit) * padding_unit
    return target_width, target_height


def _calc_hd_transform_size(width: int, height: int, hd_num: int = 16):
    """
    Calculate the HD transform size for dynamic resolution.
    Phi-3.5 uses a 336x336 base size and supports up to hd_num tiles.
    """
    transposed = False
    if width < height:
        width, height = height, width
        transposed = True

    ratio = width / height
    scale = 1
    while scale * math.ceil(scale / ratio) <= hd_num:
        scale += 1
    scale -= 1

    new_width = int(scale * 336)
    new_height = int(new_width / ratio)

    # Make dimensions divisible by 336
    padded_width, padded_height = _calc_padded_size(new_width, new_height, 336)

    if transposed:
        padded_width, padded_height = padded_height, padded_width

    return padded_width, padded_height


def _hd_transform(img: Image.Image, hd_num: int = 16) -> Image.Image:
    """
    Apply HD transform to resize image for dynamic resolution.
    """
    width, height = img.size
    target_width, target_height = _calc_hd_transform_size(width, height, hd_num)
    return img.resize((target_width, target_height), Image.Resampling.BICUBIC)


def _pad_to_336(img: Image.Image) -> Image.Image:
    """
    Pad image dimensions to be divisible by 336.
    """
    width, height = img.size
    target_width = math.ceil(width / 336) * 336
    target_height = math.ceil(height / 336) * 336

    if target_width == width and target_height == height:
        return img

    # Create new image with black background
    new_img = Image.new("RGB", (target_width, target_height), (0, 0, 0))
    new_img.paste(img, (0, 0))
    return new_img


class Phi3VImageProcessor(BaseImageProcessor):
    """
    Image processor for Phi-3.5-Vision models.

    Processes images using HD dynamic resolution with 336x336 tiles,
    similar to the official Phi-3.5-Vision implementation.
    """

    model_input_names = ["pixel_values", "image_sizes"]

    def __init__(
        self,
        image_mean: Tuple[float, float, float] = OPENAI_CLIP_MEAN,
        image_std: Tuple[float, float, float] = OPENAI_CLIP_STD,
        num_crops: int = 4,
        num_img_tokens: int = 144,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.image_mean = image_mean
        self.image_std = image_std
        self.num_crops = num_crops
        self.num_img_tokens = num_img_tokens
        self.img_size = 336

    def calc_num_image_tokens(self, image: Image.Image) -> int:
        """
        Calculate the number of image tokens for a given image.
        """
        width, height = image.size
        hd_width, hd_height = _calc_hd_transform_size(width, height, self.num_crops)
        num_h_tiles = hd_height // self.img_size
        num_w_tiles = hd_width // self.img_size
        # Global image tokens + sub-image tokens + separators
        num_tokens = (
            (num_h_tiles * num_w_tiles + 1) * self.num_img_tokens
            + 1
            + (num_h_tiles + 1) * 12
        )
        return num_tokens

    def _process_single_image(
        self, image: Image.Image
    ) -> Tuple[np.ndarray, Tuple[int, int]]:
        """
        Process a single image with HD transform and normalize.

        Returns:
            pixel_values: numpy array of shape (num_tiles + 1, C, H, W)
            image_size: (height, width) of the HD transformed image
        """
        # Ensure RGB
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Apply HD transform
        hd_image = _hd_transform(image, self.num_crops)
        hd_image = _pad_to_336(hd_image)
        hd_width, hd_height = hd_image.size

        # Create global image (resized to 336x336)
        global_image = hd_image.resize(
            (self.img_size, self.img_size), Image.Resampling.BICUBIC
        )

        # Split HD image into 336x336 tiles
        num_h_tiles = hd_height // self.img_size
        num_w_tiles = hd_width // self.img_size

        tiles = []
        for h in range(num_h_tiles):
            for w in range(num_w_tiles):
                left = w * self.img_size
                top = h * self.img_size
                right = left + self.img_size
                bottom = top + self.img_size
                tile = hd_image.crop((left, top, right, bottom))
                tiles.append(tile)

        # Global image first, then tiles
        all_images = [global_image] + tiles

        # Convert to numpy arrays and normalize
        processed = []
        for img in all_images:
            arr = np.array(img, dtype=np.float32) / 255.0
            # Normalize
            arr = (arr - np.array(self.image_mean)) / np.array(self.image_std)
            # HWC to CHW
            arr = arr.transpose(2, 0, 1)
            processed.append(arr)

        pixel_values = np.stack(processed, axis=0)  # (num_tiles + 1, C, H, W)
        image_size = (hd_height, hd_width)

        return pixel_values, image_size

    def preprocess(
        self,
        images: ImageInput,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """Process images and return BatchFeature."""
        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image or similar."
            )

        all_pixel_values = []
        all_image_sizes = []

        for image in images:
            # Convert to PIL if needed
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)

            pixel_values, image_size = self._process_single_image(image)
            all_pixel_values.append(pixel_values)
            all_image_sizes.append(image_size)

        # Stack with padding to handle variable number of tiles
        max_tiles = max(pv.shape[0] for pv in all_pixel_values)
        batch_size = len(all_pixel_values)

        # Pad to same number of tiles
        padded_pixel_values = []
        for pv in all_pixel_values:
            if pv.shape[0] < max_tiles:
                padding = np.zeros(
                    (max_tiles - pv.shape[0], *pv.shape[1:]), dtype=pv.dtype
                )
                pv = np.concatenate([pv, padding], axis=0)
            padded_pixel_values.append(pv)

        pixel_values = np.stack(padded_pixel_values, axis=0)  # (B, T, C, H, W)
        image_sizes = np.array(all_image_sizes)  # (B, 2)

        data = {
            "pixel_values": mx.array(pixel_values),
            "image_sizes": mx.array(image_sizes),
        }

        return BatchFeature(data=data, tensor_type=return_tensors)

    def __call__(
        self,
        images: ImageInput,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """Make the image processor callable."""
        return self.preprocess(images, return_tensors=return_tensors, **kwargs)


class Phi3VProcessor(ProcessorMixin):
    """
    MLX-based processor for Phi-3.5-Vision that doesn't require torch/torchvision.

    Constructs a Phi3V processor which wraps a Phi3V image processor and a tokenizer
    into a single processor.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "Phi3VImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        **kwargs,
    ):
        if image_processor is None:
            image_processor = Phi3VImageProcessor()
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def _convert_images_texts_to_inputs(
        self,
        images: List[Image.Image],
        texts: str,
        padding: bool = False,
        truncation: bool = None,
        max_length: int = None,
    ) -> BatchFeature:
        """
        Convert images and text to model inputs, replacing image tokens with negative IDs.

        The Phi3V model expects image tokens to be represented as negative values in input_ids.
        For example, <|image_1|> becomes a sequence of -1 values, <|image_2|> becomes -2 values.
        """
        # Pattern to match image tokens like <|image_1|>, <|image_2|>, etc.
        pattern = r"<\|image_\d+\|>"

        # Process images first to get their sizes and calculate token counts
        if images:
            images = make_list_of_images(images)
            pil_images = []
            for img in images:
                if isinstance(img, np.ndarray):
                    img = Image.fromarray(img)
                if img.mode != "RGB":
                    img = img.convert("RGB")
                pil_images.append(img)

            # Calculate number of tokens for each image
            num_img_tokens = [
                self.image_processor.calc_num_image_tokens(img) for img in pil_images
            ]

            # Process images through image processor
            image_inputs = self.image_processor(pil_images)
        else:
            pil_images = []
            num_img_tokens = []
            image_inputs = {}

        # Find image tags and extract their IDs
        image_tags = re.findall(pattern, texts)

        if image_tags:
            # Extract image IDs from tags (e.g., <|image_1|> -> 1)
            image_ids = [int(tag.split("|")[1].split("_")[-1]) for tag in image_tags]

            # Validate: unique image IDs should be sequential starting from 1
            unique_ids = sorted(set(image_ids))
            if unique_ids != list(range(1, len(unique_ids) + 1)):
                raise ValueError(
                    f"Image IDs must be sequential starting from 1. Got: {unique_ids}"
                )

            # Validate: number of unique image IDs should match number of images
            if len(unique_ids) != len(pil_images):
                raise ValueError(
                    f"Number of image tags ({len(unique_ids)}) doesn't match "
                    f"number of images ({len(pil_images)})"
                )

            # Create padded negative IDs for each image tag
            # Each <|image_N|> is replaced with num_img_tokens[N-1] copies of -N
            image_ids_pad = [[-iid] * num_img_tokens[iid - 1] for iid in image_ids]

            # Split text by image pattern and tokenize each chunk
            text_chunks = texts.split("<|image_")

            # Reconstruct the split to handle the pattern properly
            prompt_chunks = []
            for i, chunk in enumerate(re.split(pattern, texts)):
                tokens = self.tokenizer.encode(chunk, add_special_tokens=(i == 0))
                prompt_chunks.append(tokens)

            # Interleave text chunks with image token sequences
            input_ids = []
            img_idx = 0
            for i, chunk in enumerate(prompt_chunks):
                # Add text tokens (skip BOS if not first chunk)
                offset = 0 if i == 0 else 1  # Skip BOS token for subsequent chunks
                if i > 0 and len(chunk) > 0 and chunk[0] == self.tokenizer.bos_token_id:
                    offset = 1
                input_ids.extend(chunk[offset:])

                # Add image tokens if there's a corresponding image
                if img_idx < len(image_ids_pad):
                    input_ids.extend(image_ids_pad[img_idx])
                    img_idx += 1
        else:
            # No image tokens, just tokenize normally
            input_ids = self.tokenizer.encode(texts)

        # Create attention mask (all tokens including negative IDs are attended to)
        attention_mask = [1] * len(input_ids)

        text_inputs = {
            "input_ids": mx.array([input_ids]),
            "attention_mask": mx.array([attention_mask]),
        }

        return BatchFeature(data={**text_inputs, **image_inputs})

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s).

        Args:
            images: The image or batch of images to be prepared.
            text: The sequence or batch of sequences to be encoded.
            return_tensors: If set, will return tensors of a particular framework.

        Returns:
            BatchFeature with input_ids, attention_mask, pixel_values, and image_sizes.
        """
        if images is None and text is None:
            raise ValueError("You have to specify at least one of `images` or `text`.")

        # Check if images and text inputs are reversed for BC
        images, text = _validate_images_text_input_order(images, text)

        # Extract return_tensors from kwargs (unused, we always return MLX arrays)
        kwargs.pop("return_tensors", None)
        padding = kwargs.pop("padding", False)
        truncation = kwargs.pop("truncation", None)
        max_length = kwargs.pop("max_length", None)

        # Convert to list if single text
        if isinstance(text, str):
            texts = [text]
        elif text is not None:
            texts = list(text)
        else:
            texts = None

        # Convert images to list if needed
        if images is not None:
            if not isinstance(images, list):
                images = [images]
        else:
            images = []

        # Process images and text together (handles image token replacement)
        if texts is not None:
            # For now, handle single text input (batching can be added later)
            if len(texts) == 1:
                return self._convert_images_texts_to_inputs(
                    images=images,
                    texts=texts[0],
                    padding=padding,
                    truncation=truncation,
                    max_length=max_length,
                )
            else:
                # Batch processing: process each text separately and combine
                all_input_ids = []
                all_attention_masks = []
                all_pixel_values = []
                all_image_sizes = []

                for txt in texts:
                    result = self._convert_images_texts_to_inputs(
                        images=images,
                        texts=txt,
                        padding=padding,
                        truncation=truncation,
                        max_length=max_length,
                    )
                    all_input_ids.append(result["input_ids"][0].tolist())
                    all_attention_masks.append(result["attention_mask"][0].tolist())
                    if "pixel_values" in result:
                        all_pixel_values.append(result["pixel_values"])
                    if "image_sizes" in result:
                        all_image_sizes.append(result["image_sizes"])

                # Pad input_ids and attention_masks to same length
                max_len = max(len(ids) for ids in all_input_ids)
                pad_token_id = self.tokenizer.pad_token_id or 0

                padded_input_ids = []
                padded_attention_masks = []
                for ids, mask in zip(all_input_ids, all_attention_masks):
                    padding_length = max_len - len(ids)
                    padded_input_ids.append(ids + [pad_token_id] * padding_length)
                    padded_attention_masks.append(mask + [0] * padding_length)

                data = {
                    "input_ids": mx.array(padded_input_ids),
                    "attention_mask": mx.array(padded_attention_masks),
                }

                if all_pixel_values:
                    data["pixel_values"] = all_pixel_values[
                        0
                    ]  # Same images for all texts
                if all_image_sizes:
                    data["image_sizes"] = all_image_sizes[0]

                return BatchFeature(data=data)

        # Text-only case
        if images:
            image_inputs = self.image_processor(images)
        else:
            image_inputs = {}

        return BatchFeature(data=image_inputs)

    def batch_decode(self, *args, **kwargs):
        """Forward to tokenizer's batch_decode."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward to tokenizer's decode."""
        return self.tokenizer.decode(*args, **kwargs)

    def apply_chat_template(
        self,
        conversation,
        chat_template=None,
        add_generation_prompt=False,
        tokenize=False,
        **kwargs,
    ):
        """Apply chat template to the conversation."""
        if chat_template is None:
            chat_template = self.chat_template
        if chat_template is None:
            chat_template = getattr(self.tokenizer, "chat_template", None)

        if chat_template is None:
            raise ValueError(
                "No chat template found. Please provide a chat_template argument "
                "or ensure the tokenizer has a chat_template attribute."
            )

        try:
            from jinja2 import Template
        except ImportError:
            raise ImportError("jinja2 is required for apply_chat_template")

        template = Template(chat_template)
        rendered = template.render(
            messages=conversation,
            add_generation_prompt=add_generation_prompt,
            **kwargs,
        )

        if tokenize:
            return self.tokenizer.encode(rendered)
        return rendered

    @property
    def model_input_names(self):
        """Get the model input names from tokenizer and image processor."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load the processor from a pretrained model path."""
        from huggingface_hub import hf_hub_download

        kwargs.pop("trust_remote_code", None)

        model_path = Path(pretrained_model_name_or_path)
        is_local = model_path.exists() and model_path.is_dir()

        tokenizer = AutoTokenizer.from_pretrained(
            str(model_path) if is_local else pretrained_model_name_or_path,
            trust_remote_code=True,
            local_files_only=is_local,
        )

        # Load image processor config
        image_processor_config = {}
        try:
            if is_local:
                config_path = model_path / "preprocessor_config.json"
            else:
                config_path = Path(
                    hf_hub_download(
                        pretrained_model_name_or_path, "preprocessor_config.json"
                    )
                )
            if config_path.exists():
                with open(config_path, "r", encoding="utf-8") as f:
                    preprocessor_config = json.load(f)
                if "num_crops" in preprocessor_config:
                    image_processor_config["num_crops"] = preprocessor_config[
                        "num_crops"
                    ]
                if "num_img_tokens" in preprocessor_config:
                    image_processor_config["num_img_tokens"] = preprocessor_config[
                        "num_img_tokens"
                    ]
                if "image_mean" in preprocessor_config:
                    image_processor_config["image_mean"] = tuple(
                        preprocessor_config["image_mean"]
                    )
                if "image_std" in preprocessor_config:
                    image_processor_config["image_std"] = tuple(
                        preprocessor_config["image_std"]
                    )
        except Exception:
            pass

        image_processor = Phi3VImageProcessor(**image_processor_config)

        # Load chat template from jinja file if not already set on tokenizer
        chat_template = getattr(tokenizer, "chat_template", None)
        if chat_template is None:
            try:
                if is_local:
                    jinja_path = model_path / "chat_template.jinja"
                else:
                    jinja_path = Path(
                        hf_hub_download(
                            pretrained_model_name_or_path, "chat_template.jinja"
                        )
                    )
                if jinja_path.exists():
                    chat_template = jinja_path.read_text(encoding="utf-8")
                    tokenizer.chat_template = chat_template
            except Exception:
                pass

        return cls(
            image_processor=image_processor,
            tokenizer=tokenizer,
            chat_template=chat_template,
        )


# Register the processor with AutoProcessor
from transformers import AutoProcessor

_original_auto_processor_from_pretrained = AutoProcessor.from_pretrained


@classmethod
def _patched_auto_processor_from_pretrained(
    cls, pretrained_model_name_or_path, **kwargs
):
    """Patched from_pretrained that returns Phi3VProcessor for phi3_v models."""
    from huggingface_hub import hf_hub_download

    model_path = Path(pretrained_model_name_or_path)
    is_local = model_path.exists() and model_path.is_dir()

    # Check if this is a phi3_v model
    is_phi3_v = False
    try:
        if is_local:
            config_path = model_path / "config.json"
        else:
            config_path = Path(
                hf_hub_download(pretrained_model_name_or_path, "config.json")
            )
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
        model_type = config.get("model_type", "").lower()
        is_phi3_v = model_type in ("phi3_v", "phi3-v", "phi3v")
    except Exception:
        pass

    if is_phi3_v:
        return Phi3VProcessor.from_pretrained(pretrained_model_name_or_path, **kwargs)

    return _original_auto_processor_from_pretrained.__func__(
        cls, pretrained_model_name_or_path, **kwargs
    )


AutoProcessor.from_pretrained = _patched_auto_processor_from_pretrained