fount-vlm-nell-02 0.3.11 (fount_vlm_nell_02-0.3.11-py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/models/ernie4_5_moe_vl/processor.py
@@ -0,0 +1,686 @@
+"""Image processor and Processor for ERNIE 4.5 VL MoE."""
+
+import math
+import os
+from shutil import copyfile
+from typing import Dict, List, Optional, Tuple, Union
+
+import mlx.core as mx
+import numpy as np
+import sentencepiece as spm
+from PIL import Image
+from transformers import AutoImageProcessor, AutoProcessor
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_processing_utils import (
+    BaseImageProcessor as HFBaseImageProcessor,
+)
+from transformers.image_transforms import (
+    normalize,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from transformers.image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    is_valid_image,
+    to_numpy_array,
+)
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+
+class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
+    """Tokenizer for ERNIE 4.5 VL model using SentencePiece."""
+
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
+    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+    padding_side = "right"
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        cls_token="<|begin_of_sentence|>",
+        eos_token="</s>",
+        mask_token="<mask:1>",
+        pad_token="<unk>",
+        sep_token="<|end_of_sentence|>",
+        unk_token="<unk>",
+        additional_special_tokens=None,
+        chat_template=None,
+        **kwargs,
+    ):
+        self.vocab_file = vocab_file
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+
+        if additional_special_tokens is None:
+            additional_special_tokens = ["<mask:1>", "<mask:7>"]
+
+        # Load chat_template from tokenizer_config.json if not provided
+        if chat_template is None:
+            import json
+
+            config_file = os.path.join(
+                os.path.dirname(vocab_file), "tokenizer_config.json"
+            )
+            if os.path.exists(config_file):
+                with open(config_file, "r") as f:
+                    config = json.load(f)
+                chat_template = config.get("chat_template")
+
+        super().__init__(
+            bos_token=bos_token,
+            cls_token=cls_token,
+            eos_token=eos_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            additional_special_tokens=additional_special_tokens,
+            chat_template=chat_template,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return self.sp_model.vocab_size()
+
+    @property
+    def space_token_id(self):
+        return self.sp_model.piece_to_id("<mask:1>")
+
+    @property
+    def gend_token_id(self):
+        return self.sp_model.piece_to_id("<mask:7>")
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        return self.sp_model.encode_as_pieces(text)
+
+    def _convert_token_to_id(self, token):
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, id):
+        return self.sp_model.id_to_piece(id)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if token_ids_1 is None:
+            return token_ids_0
+        return token_ids_0 + token_ids_1
+
+    def convert_tokens_to_string(self, tokens):
+        current_sub_tokens = []
+        out_string = ""
+        for token in tokens:
+            if token in self.all_special_tokens:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            return None
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + self.vocab_files_names["vocab_file"],
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+
+    def _decode(self, *args, **kwargs):
+        kwargs.pop("clean_up_tokenization_spaces", None)
+        kwargs.pop("spaces_between_special_tokens", None)
+        return super()._decode(
+            *args,
+            **kwargs,
+            clean_up_tokenization_spaces=False,
+            spaces_between_special_tokens=False,
+        )
+
+
+def _validate_images_text_input_order(images, text):
+    if isinstance(images, str) and text is None:
+        return None, images
+    if images is not None and text is not None:
+        if isinstance(images, str) and not isinstance(text, str):
+            return text, images
+    return images, text
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    return math.floor(number / factor) * factor
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 56 * 56,
+    max_pixels: int = 28 * 28 * 1280,
+) -> Tuple[int, int]:
+    MAX_RATIO = 200
+    if height / width > MAX_RATIO:
+        width = height // MAX_RATIO
+    elif width / height > MAX_RATIO:
+        height = width // MAX_RATIO
+
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(int(height / beta), factor)
+        w_bar = floor_by_factor(int(width / beta), factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(int(height * beta), factor)
+        w_bar = ceil_by_factor(int(width * beta), factor)
+
+    h_bar = max(factor, h_bar)
+    w_bar = max(factor, w_bar)
+
+    return h_bar, w_bar
+
+
+class ImageProcessor(HFBaseImageProcessor):
+    """Image processor for ERNIE 4.5 VL MoE model."""
+
+    model_input_names = ["pixel_values", "image_grid_thw"]
+
+    def __init__(
+        self,
+        image_mean: Tuple[float, ...] = (0.48145466, 0.4578275, 0.40821073),
+        image_std: Tuple[float, ...] = (0.26862954, 0.26130258, 0.27577711),
+        size: Tuple[int, int] = (224, 224),
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        rescale_factor: float = 1 / 255,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        patch_size: int = 14,
+        merge_size: int = 2,
+        temporal_patch_size: int = 2,
+        min_pixels: int = 56 * 56,
+        max_pixels: int = 28 * 28 * 1280,
+        config=None,
+        **kwargs,
+    ):
+        if config is not None:
+            if isinstance(config, dict):
+                vision_config = config.get("vision_config", {})
+                image_mean = config.get("image_mean", image_mean)
+                image_std = config.get("image_std", image_std)
+                min_pixels = config.get("min_pixels", min_pixels)
+                max_pixels = config.get("max_pixels", max_pixels)
+                patch_size = vision_config.get(
+                    "patch_size", config.get("patch_size", patch_size)
+                )
+                merge_size = vision_config.get(
+                    "spatial_merge_size", config.get("spatial_merge_size", merge_size)
+                )
+                temporal_patch_size = vision_config.get(
+                    "temporal_patch_size",
+                    config.get("temporal_patch_size", temporal_patch_size),
+                )
+            else:
+                patch_size = getattr(config, "patch_size", patch_size)
+                merge_size = getattr(
+                    config,
+                    "spatial_merge_size",
+                    getattr(config, "merge_size", merge_size),
+                )
+                temporal_patch_size = getattr(
+                    config, "temporal_patch_size", temporal_patch_size
+                )
+
+        HFBaseImageProcessor.__init__(self, **kwargs)
+
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.size = size
+        self.resample = resample
+        self.rescale_factor = rescale_factor
+        self.data_format = data_format
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.factor = patch_size * merge_size
+
+    def get_smart_resize(
+        self,
+        height: int,
+        width: int,
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+    ) -> Tuple[Tuple[int, int], Tuple[int, int]]:
+        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=self.factor,
+            min_pixels=actual_min_pixels,
+            max_pixels=actual_max_pixels,
+        )
+
+        grid_h = resized_height // self.patch_size
+        grid_w = resized_width // self.patch_size
+
+        return (resized_height, resized_width), (grid_h, grid_w)
+
+    def _extract_patches(
+        self,
+        image: np.ndarray,
+        grid_h: int,
+        grid_w: int,
+    ) -> np.ndarray:
+        C, H, W = image.shape
+
+        patches = image.reshape(
+            C,
+            grid_h // self.merge_size,
+            self.merge_size,
+            self.patch_size,
+            grid_w // self.merge_size,
+            self.merge_size,
+            self.patch_size,
+        )
+
+        patches = patches.transpose(1, 4, 2, 5, 0, 3, 6)
+
+        num_patches = (
+            (grid_h // self.merge_size)
+            * (grid_w // self.merge_size)
+            * (self.merge_size**2)
+        )
+        patches = patches.reshape(num_patches, C * self.patch_size * self.patch_size)
+
+        return patches
+
+    def preprocess(
+        self,
+        images: Union[Image.Image, List[Image.Image]],
+        return_grid_thw: bool = True,
+    ) -> Union[np.ndarray, Dict]:
+        if isinstance(images, Image.Image):
+            images = [images]
+
+        all_patches = []
+        all_grid_thw = []
+
+        for image in images:
+            if image.mode != "RGB":
+                image = image.convert("RGB")
+
+            (resized_h, resized_w), (grid_h, grid_w) = self.get_smart_resize(
+                image.height, image.width
+            )
+
+            img_array = to_numpy_array(image)
+            img_array = resize(
+                img_array,
+                size=(resized_h, resized_w),
+                resample=self.resample,
+                data_format=ChannelDimension.LAST,
+                input_data_format=ChannelDimension.LAST,
+            )
+
+            img_array = rescale(
+                img_array,
+                scale=self.rescale_factor,
+                data_format=ChannelDimension.LAST,
+                input_data_format=ChannelDimension.LAST,
+            )
+
+            img_array = normalize(
+                img_array,
+                mean=self.image_mean,
+                std=self.image_std,
+                data_format=ChannelDimension.LAST,
+                input_data_format=ChannelDimension.LAST,
+            )
+
+            img_array = to_channel_dimension_format(
+                img_array,
+                channel_dim=ChannelDimension.FIRST,
+                input_channel_dim=ChannelDimension.LAST,
+            )
+
+            patches = self._extract_patches(img_array, grid_h, grid_w)
+            all_patches.append(patches)
+            all_grid_thw.append([1, grid_h, grid_w])
+
+        pixel_values = np.concatenate(all_patches, axis=0)
+
+        if return_grid_thw:
+            return {
+                "pixel_values": pixel_values,
+                "image_grid_thw": np.array(all_grid_thw, dtype=np.int64),
+            }
+
+        return pixel_values
+
+    def preprocess_video(
+        self,
+        frames: List[Image.Image],
+        return_grid_thw: bool = True,
+    ) -> Union[np.ndarray, Dict]:
+        if not frames:
+            raise ValueError("frames list cannot be empty")
+
+        first_frame = frames[0]
+        if first_frame.mode != "RGB":
+            first_frame = first_frame.convert("RGB")
+
+        (resized_h, resized_w), (grid_h, grid_w) = self.get_smart_resize(
+            first_frame.height, first_frame.width
+        )
+
+        all_patches = []
+
+        for frame in frames:
+            if frame.mode != "RGB":
+                frame = frame.convert("RGB")
+
+            img_array = to_numpy_array(frame)
+            img_array = resize(
+                img_array,
+                size=(resized_h, resized_w),
+                resample=self.resample,
+                data_format=ChannelDimension.LAST,
+                input_data_format=ChannelDimension.LAST,
+            )
+
+            img_array = rescale(
+                img_array,
+                scale=self.rescale_factor,
+                data_format=ChannelDimension.LAST,
+                input_data_format=ChannelDimension.LAST,
+            )
+
+            img_array = normalize(
+                img_array,
+                mean=self.image_mean,
+                std=self.image_std,
+                data_format=ChannelDimension.LAST,
+                input_data_format=ChannelDimension.LAST,
+            )
+
+            img_array = to_channel_dimension_format(
+                img_array,
+                channel_dim=ChannelDimension.FIRST,
+                input_channel_dim=ChannelDimension.LAST,
+            )
+
+            patches = self._extract_patches(img_array, grid_h, grid_w)
+            all_patches.append(patches)
+
+        pixel_values = np.concatenate(all_patches, axis=0)
+        num_frames = len(frames)
+        grid_t = num_frames
+
+        if return_grid_thw:
+            return {
+                "pixel_values": pixel_values,
+                "video_grid_thw": np.array([[grid_t, grid_h, grid_w]], dtype=np.int64),
+            }
+
+        return pixel_values
+
+    def __call__(
+        self,
+        images: ImageInput,
+        **kwargs,
+    ) -> BatchFeature:
+        return self.preprocess(images, **kwargs)
+
+
+class Ernie4_5_VLProcessor(ProcessorMixin):
+    """Processor for ERNIE 4.5 VL that wraps image processor and tokenizer."""
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template", "spatial_conv_size", "temporal_conv_size"]
+    image_processor_class = "ImageProcessor"
+    tokenizer_class = "Ernie4_5_VLTokenizer"
+
+    IMG_START = "<|IMAGE_START|>"
+    IMG_END = "<|IMAGE_END|>"
+    VID_START = "<|VIDEO_START|>"
+    VID_END = "<|VIDEO_END|>"
+    IMAGE_PLACEHOLDER = "<|IMAGE_PLACEHOLDER|>"
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=None,
+        spatial_conv_size: int = 2,
+        temporal_conv_size: int = 2,
+        **kwargs,
+    ):
+        if image_processor is None:
+            image_processor = ImageProcessor()
+        self.spatial_conv_size = spatial_conv_size
+        self.temporal_conv_size = temporal_conv_size
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    @property
+    def pad_token(self):
+        return self.tokenizer.pad_token if self.tokenizer else None
+
+    @property
+    def pad_token_id(self):
+        return self.tokenizer.pad_token_id if self.tokenizer else None
+
+    @property
+    def eos_token(self):
+        return self.tokenizer.eos_token if self.tokenizer else None
+
+    @property
+    def eos_token_id(self):
+        return self.tokenizer.eos_token_id if self.tokenizer else None
+
+    @property
+    def bos_token(self):
+        return self.tokenizer.bos_token if self.tokenizer else None
+
+    @property
+    def bos_token_id(self):
+        return self.tokenizer.bos_token_id if self.tokenizer else None
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[
+            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
+        ] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        if images is None and text is None:
+            raise ValueError("You have to specify at least one of `images` or `text`.")
+
+        images, text = _validate_images_text_input_order(images, text)
+        kwargs.pop("return_tensors", None)
+
+        if images is not None:
+            if is_valid_image(images):
+                images = [images]
+
+            image_inputs = self.image_processor(images)
+            image_grid_thw = image_inputs["image_grid_thw"]
+        else:
+            image_inputs = {}
+            image_grid_thw = None
+
+        if isinstance(text, str):
+            text = [text]
+        elif text is not None and not isinstance(text, list):
+            raise ValueError(
+                "Invalid input text. Please provide a string, or a list of strings"
+            )
+
+        if image_grid_thw is not None and text is not None:
+            merge_length = self.spatial_conv_size * self.spatial_conv_size
+            index = 0
+            for i in range(len(text)):
+                # Handle <|image@placeholder|> format used in chat templates
+                placeholder = f"{self.IMG_START}<|image@placeholder|>{self.IMG_END}"
+                while placeholder in text[i]:
+                    if index < len(image_grid_thw):
+                        grid_thw = image_grid_thw[index]
+                        # grid_thw is [t, h, w], compute number of tokens
+                        num_patches = int(np.prod(grid_thw))
+                        num_placeholders = num_patches // merge_length
+                        replacement = (
+                            f"{self.IMG_START}"
+                            f"{self.IMAGE_PLACEHOLDER * num_placeholders}"
+                            f"{self.IMG_END}"
+                        )
+                        text[i] = text[i].replace(placeholder, replacement, 1)
+                        index += 1
+                    else:
+                        break
+
+        if text is not None:
+            all_input_ids = []
+            for t in text:
+                ids = self.tokenizer.encode(t)
+                all_input_ids.append(ids)
+
+            max_len = max(len(ids) for ids in all_input_ids)
+            pad_token_id = self.tokenizer.pad_token_id or 0
+
+            padded_input_ids = []
+            attention_masks = []
+            for ids in all_input_ids:
+                padding_length = max_len - len(ids)
+                padded_ids = ids + [pad_token_id] * padding_length
+                mask = [1] * len(ids) + [0] * padding_length
+                padded_input_ids.append(padded_ids)
+                attention_masks.append(mask)
+
+            if images is None:
+                if len(padded_input_ids) == 1:
+                    text_inputs = {
+                        "input_ids": padded_input_ids[0],
+                        "attention_mask": attention_masks[0],
+                    }
+                else:
+                    text_inputs = {
+                        "input_ids": padded_input_ids,
+                        "attention_mask": attention_masks,
+                    }
+            else:
+                text_inputs = {
+                    "input_ids": mx.array(padded_input_ids),
+                    "attention_mask": mx.array(attention_masks),
+                }
+        else:
+            text_inputs = {}
+
+        if image_inputs:
+            image_inputs = {
+                "pixel_values": mx.array(image_inputs["pixel_values"]),
+                "image_grid_thw": mx.array(image_inputs["image_grid_thw"]),
+            }
+
+        return BatchFeature(data={**text_inputs, **image_inputs})
+
+    def batch_decode(self, *args, **kwargs):
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def apply_chat_template(
+        self,
+        conversation,
+        chat_template=None,
+        add_generation_prompt=False,
+        tokenize=False,
+        **kwargs,
+    ):
+        if chat_template is None:
+            chat_template = self.chat_template
+        if chat_template is None:
+            chat_template = getattr(self.tokenizer, "chat_template", None)
+        if chat_template is None:
+            raise ValueError(
+                "No chat template found. Please provide a chat_template argument "
+                "or ensure the tokenizer has a chat_template attribute."
+            )
+
+        # Use jinja2 to render the template
+        try:
+            from jinja2 import Template
+        except ImportError:
+            raise ImportError("jinja2 is required for apply_chat_template")
+
+        template = Template(chat_template)
+        rendered = template.render(
+            messages=conversation,
+            add_generation_prompt=add_generation_prompt,
+            **kwargs,
+        )
+
+        if tokenize:
+            return self.tokenizer.encode(rendered)
+        return rendered
+
+    @staticmethod
+    def from_pretrained(pretrained_model_name_or_path, **kwargs):
+        from pathlib import Path
+
+        if not Path(pretrained_model_name_or_path).exists():
+            from huggingface_hub import snapshot_download
+
+            pretrained_model_name_or_path = snapshot_download(
+                pretrained_model_name_or_path,
+                allow_patterns=["*.json", "*.model", "*.txt"],
+            )
+
+        tokenizer = Ernie4_5_VLTokenizer.from_pretrained(pretrained_model_name_or_path)
+        image_processor = ImageProcessor()
+
+        return Ernie4_5_VLProcessor(
+            image_processor=image_processor, tokenizer=tokenizer
+        )
+
+
+MODEL_TYPE = "ernie4_5_moe_vl"
+
+try:
+    AutoImageProcessor.register(MODEL_TYPE, slow_image_processor_class=ImageProcessor)
+    AutoProcessor.register(MODEL_TYPE, Ernie4_5_VLProcessor)
+except Exception as e:
+    raise Exception(f"Error registering {MODEL_TYPE} processor: {e}")
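
For orientation, below is a minimal usage sketch of the processor added in this file. It is not part of the package diff: the checkpoint path and prompt wording are illustrative, and the import path simply mirrors the file location listed above (mlx_vlm/models/ernie4_5_moe_vl/processor.py).

    # Minimal sketch (not shipped in the wheel). Assumes the package is installed and
    # that a local ERNIE 4.5 VL checkpoint directory with tokenizer.model and
    # tokenizer_config.json exists; the path below is hypothetical.
    from PIL import Image

    from mlx_vlm.models.ernie4_5_moe_vl.processor import Ernie4_5_VLProcessor

    processor = Ernie4_5_VLProcessor.from_pretrained("path/to/ernie4_5_vl_checkpoint")

    # The processor expands this placeholder block into one <|IMAGE_PLACEHOLDER|>
    # token per merged patch, based on image_grid_thw and spatial_conv_size.
    prompt = "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>Describe the image."
    image = Image.open("example.jpg")

    inputs = processor(images=image, text=prompt)
    # With images present, input_ids and attention_mask come back as mx.array;
    # pixel_values holds flattened patches and image_grid_thw holds [t, h, w] per image.
    print(inputs["input_ids"].shape, inputs["pixel_values"].shape, inputs["image_grid_thw"])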