fount_vlm_nell_02-0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/models/paddleocr_vl/paddleocr_vl.py
@@ -0,0 +1,207 @@
1
+ from typing import Optional
2
+
3
+ import mlx.core as mx
4
+ import mlx.nn as nn
5
+
6
+ from ..base import InputEmbeddingsFeatures, install_auto_processor_patch
7
+ from .config import ModelConfig
8
+ from .language import LanguageModel
9
+ from .processing_paddleocr_vl import PaddleOCRVLProcessor
10
+ from .vision import VisionModel
11
+
12
+ install_auto_processor_patch("paddleocr_vl", PaddleOCRVLProcessor)
13
+
14
+
15
+ class Model(nn.Module):
16
+ def __init__(self, config: ModelConfig):
17
+ super().__init__()
18
+ self.config = config
19
+ self.visual = VisionModel(config.vision_config)
20
+ self.language_model = LanguageModel(config.text_config, config)
21
+
22
+ def get_input_embeddings(
23
+ self,
24
+ input_ids: Optional[mx.array] = None,
25
+ pixel_values: Optional[mx.array] = None,
26
+ **kwargs,
27
+ ):
28
+ image_grid_thw = kwargs.pop("image_grid_thw", None)
29
+ video_grid_thw = kwargs.pop("video_grid_thw", None)
30
+ mask = kwargs.pop("mask", None)
31
+ grid_thw = image_grid_thw if image_grid_thw is not None else video_grid_thw
32
+
33
+ if pixel_values is None:
34
+ # Reset position state for text-only generation
35
+ self.language_model._position_ids = None
36
+ self.language_model._rope_deltas = None
37
+ return InputEmbeddingsFeatures(
38
+ inputs_embeds=self.language_model.model.embed_tokens(input_ids)
39
+ )
40
+
41
+ dtype = self.visual.embeddings.patch_embedding.weight.dtype
42
+ pixel_values = mx.array(pixel_values, dtype=dtype)
43
+
44
+ # Get the input embeddings from the language model
45
+ inputs_embeds = self.language_model.model.embed_tokens(input_ids)
46
+
47
+ # Get the output hidden states from the vision model
48
+ hidden_states = self.visual(pixel_values, grid_thw, output_hidden_states=False)
49
+
50
+ # Insert special image tokens in the input_ids
51
+ final_inputs_embeds = self.merge_input_ids_with_image_features(
52
+ self.config.image_token_id,
53
+ hidden_states,
54
+ inputs_embeds,
55
+ input_ids,
56
+ )
57
+
58
+ # Pre-calculate position_ids for chunked prefill
59
+ if image_grid_thw is not None or video_grid_thw is not None:
60
+ position_ids, rope_deltas = self.language_model.get_rope_index(
61
+ input_ids, image_grid_thw, video_grid_thw, mask
62
+ )
63
+ self.language_model._position_ids = position_ids
64
+ self.language_model._rope_deltas = rope_deltas
65
+
66
+ return InputEmbeddingsFeatures(inputs_embeds=final_inputs_embeds)
67
+
68
+ @staticmethod
69
+ def merge_input_ids_with_image_features(
70
+ image_token_id,
71
+ image_features,
72
+ inputs_embeds,
73
+ input_ids,
74
+ ):
75
+ """Merge image features into input embeddings at image token positions.
76
+
77
+ Args:
78
+ image_features: Vision features from the vision tower [num_features, hidden_dim]
79
+ inputs_embeds: Input embeddings [batch_size, seq_len, hidden_dim]
80
+ input_ids: Input token IDs [batch_size, seq_len]
81
+
82
+ Returns:
83
+ Updated input embeddings with image features inserted
84
+ """
85
+
86
+ # Positions of <image> tokens in input_ids
87
+ image_positions = input_ids == image_token_id
88
+
89
+ # Get dimensions
90
+ batch_size, seq_len = input_ids.shape
91
+
92
+ # Process each batch item
93
+ batch_outputs = []
94
+ feature_start_idx = 0
95
+
96
+ for batch_idx in range(batch_size):
97
+ # Get mask for this batch
98
+ image_mask = image_positions[batch_idx]
99
+ num_positions = mx.sum(image_mask).item()
100
+
101
+ if num_positions > 0:
102
+ # Extract features for this batch
103
+ batch_features = image_features[
104
+ feature_start_idx : feature_start_idx + num_positions
105
+ ]
106
+
107
+ # Validate we have the right number of features
108
+ if batch_features.shape[0] != num_positions:
109
+ raise ValueError(
110
+ f"Number of image token positions ({num_positions}) does not match "
111
+ f"number of image features ({batch_features.shape[0]}) for batch {batch_idx}"
112
+ )
113
+
114
+ # Create indices for gathering
115
+ cumsum = mx.cumsum(image_mask.astype(mx.int32))
116
+ feature_indices = mx.where(image_mask, cumsum - 1, 0)
117
+
118
+ # Gather features
119
+ gathered_features = batch_features[feature_indices]
120
+
121
+ # Combine with original embeddings
122
+ image_mask_expanded = mx.expand_dims(image_mask, axis=-1)
123
+ batch_output = mx.where(
124
+ image_mask_expanded, gathered_features, inputs_embeds[batch_idx]
125
+ )
126
+
127
+ feature_start_idx += num_positions
128
+ else:
129
+ # No image tokens in this batch item
130
+ batch_output = inputs_embeds[batch_idx]
131
+
132
+ batch_outputs.append(batch_output)
133
+
134
+ # Stack all batch outputs
135
+ return mx.stack(batch_outputs, axis=0)
136
+
137
+ @property
138
+ def layers(self):
139
+ return self.language_model.model.layers
140
+
141
+ def __call__(
142
+ self,
143
+ input_ids: mx.array,
144
+ pixel_values: Optional[mx.array] = None,
145
+ mask: Optional[mx.array] = None,
146
+ cache=None,
147
+ **kwargs,
148
+ ):
149
+
150
+ input_embeddings_features = self.get_input_embeddings(
151
+ input_ids, pixel_values, **kwargs
152
+ )
153
+ kwargs = {
154
+ "pixel_values": pixel_values,
155
+ **kwargs,
156
+ }
157
+ logits = self.language_model(
158
+ input_ids,
159
+ input_embeddings_features.inputs_embeds,
160
+ mask=mask,
161
+ cache=cache,
162
+ **kwargs,
163
+ )
164
+ return logits
165
+
166
+ def sanitize(self, weights):
167
+ _keys_to_ignore_on_load_unexpected = [
168
+ "packing_position_embedding",
169
+ "vision_model.head",
170
+ ]
171
+
172
+ def transform_key(key):
173
+ if "visual.vision_model" in key:
174
+ if "embeddings" in key or "post_layernorm" in key:
175
+ key = key.replace("visual.vision_model", "visual")
176
+ elif "encoder" in key:
177
+ key = key.replace("visual.vision_model.encoder", "visual")
178
+ elif "mlp_AR" in key:
179
+ key = key.replace("mlp_AR", "visual.projector")
180
+ elif "model" in key:
181
+ key = key.replace("model", "language_model.model")
182
+ elif "lm_head" in key:
183
+ key = key.replace("lm_head", "language_model.lm_head")
184
+
185
+ return key
186
+
187
+ new_weights = {}
188
+ for k, v in weights.items():
189
+ if (
190
+ "packing_position_embedding" in k
191
+ or "vision_model.head" in k
192
+ or ("visual" in k and "k_proj" in k)
193
+ or ("visual" in k and "v_proj" in k)
194
+ ):
195
+ continue
196
+ elif "visual" in k and "q_proj" in k:
197
+ new_key = transform_key(k)
198
+ k_proj = weights.get(k.replace("q_proj", "k_proj"), None)
199
+ v_proj = weights.get(k.replace("q_proj", "v_proj"), None)
200
+ if k_proj is not None and v_proj is not None:
201
+ merged_tensor = mx.concatenate([v, k_proj, v_proj], axis=0)
202
+ merged_key = new_key.replace("q_proj", "qkv")
203
+ new_weights[merged_key] = merged_tensor
204
+ else:
205
+ new_weights[transform_key(k)] = v
206
+
207
+ return new_weights
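
Note: merge_input_ids_with_image_features above scatters the vision-tower features into the text embedding sequence at every position holding the image placeholder token, using a cumulative-sum index and mx.where. A minimal self-contained sketch of that gather with toy shapes and made-up values (illustration only, not part of the package):

import mlx.core as mx

image_token_id = 99
input_ids = mx.array([1, 99, 99, 2])                             # one sequence with two <image> slots
inputs_embeds = mx.zeros((4, 3))                                 # [seq_len, hidden] text embeddings
image_features = mx.arange(6, dtype=mx.float32).reshape(2, 3)    # [num_image_tokens, hidden]

image_mask = input_ids == image_token_id                         # True at image-token positions
cumsum = mx.cumsum(image_mask.astype(mx.int32))                  # running count of image tokens
feature_indices = mx.where(image_mask, cumsum - 1, 0)            # which feature row feeds each position
gathered = image_features[feature_indices]                       # [seq_len, hidden]
merged = mx.where(mx.expand_dims(image_mask, -1), gathered, inputs_embeds)
print(merged)   # rows 1 and 2 now hold the two image feature vectors; rows 0 and 3 keep the text embeddings
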
mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py
@@ -0,0 +1,425 @@
1
+ import json
2
+ import math
3
+ from pathlib import Path
4
+ from typing import List, Optional, Union
5
+
6
+ import numpy as np
7
+ from transformers import AutoTokenizer
8
+ from transformers.feature_extraction_utils import BatchFeature
9
+ from transformers.image_processing_utils import BaseImageProcessor
10
+ from transformers.image_transforms import convert_to_rgb
11
+ from transformers.image_utils import (
12
+ ImageInput,
13
+ PILImageResampling,
14
+ make_flat_list_of_images,
15
+ to_numpy_array,
16
+ valid_images,
17
+ )
18
+ from transformers.processing_utils import ProcessorMixin
19
+ from transformers.utils import logging
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ def smart_resize(
25
+ height: int,
26
+ width: int,
27
+ factor: int,
28
+ min_pixels: int,
29
+ max_pixels: int,
30
+ ):
31
+ if height < factor:
32
+ width = round((width * factor) / height)
33
+ height = factor
34
+
35
+ if width < factor:
36
+ height = round((height * factor) / width)
37
+ width = factor
38
+
39
+ if max(height, width) / min(height, width) > 200:
40
+ raise ValueError(
41
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
42
+ )
43
+ h_bar = round(height / factor) * factor
44
+ w_bar = round(width / factor) * factor
45
+ if h_bar * w_bar > max_pixels:
46
+ beta = math.sqrt((height * width) / max_pixels)
47
+ h_bar = math.floor(height / beta / factor) * factor
48
+ w_bar = math.floor(width / beta / factor) * factor
49
+ elif h_bar * w_bar < min_pixels:
50
+ beta = math.sqrt(min_pixels / (height * width))
51
+ h_bar = math.ceil(height * beta / factor) * factor
52
+ w_bar = math.ceil(width * beta / factor) * factor
53
+ return h_bar, w_bar
54
+
55
+
56
+ class ImageProcessor(BaseImageProcessor):
57
+ """
58
+ MLX-native image processor for PaddleOCRVL that doesn't require torch.
59
+ """
60
+
61
+ model_input_names = ["pixel_values"]
62
+
63
+ def __init__(
64
+ self,
65
+ do_resize: bool = True,
66
+ size: dict[str, int] | None = None,
67
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
68
+ do_rescale: bool = True,
69
+ rescale_factor: int | float = 1 / 255,
70
+ do_normalize: bool = True,
71
+ image_mean: float | list[float] | None = None,
72
+ image_std: float | list[float] | None = None,
73
+ do_convert_rgb: bool = True,
74
+ min_pixels: int = 147384,
75
+ max_pixels: int = 2822400,
76
+ patch_size: int = 14,
77
+ temporal_patch_size: int = 1,
78
+ merge_size: int = 2,
79
+ **kwargs,
80
+ ) -> None:
81
+ super().__init__(**kwargs)
82
+ if size is not None:
83
+ if "shortest_edge" not in size or "longest_edge" not in size:
84
+ raise ValueError(
85
+ "size must contain 'shortest_edge' and 'longest_edge' keys."
86
+ )
87
+ else:
88
+ size = {"shortest_edge": 147384, "longest_edge": 2822400}
89
+ # backward compatibility: override size with min_pixels and max_pixels if they are provided
90
+ if min_pixels is not None:
91
+ size["shortest_edge"] = min_pixels
92
+ if max_pixels is not None:
93
+ size["longest_edge"] = max_pixels
94
+ self.min_pixels = size["shortest_edge"]
95
+ self.max_pixels = size["longest_edge"]
96
+ self.size = size
97
+ self.do_resize = do_resize
98
+ self.resample = resample
99
+ self.do_rescale = do_rescale
100
+ self.rescale_factor = rescale_factor
101
+ self.do_normalize = do_normalize
102
+ self.image_mean = image_mean if image_mean is not None else [0.5, 0.5, 0.5]
103
+ self.image_std = image_std if image_std is not None else [0.5, 0.5, 0.5]
104
+ self.patch_size = patch_size
105
+ self.temporal_patch_size = temporal_patch_size
106
+ self.merge_size = merge_size
107
+ self.do_convert_rgb = do_convert_rgb
108
+
109
+ def preprocess(
110
+ self,
111
+ images: ImageInput,
112
+ do_resize: Optional[bool] = None,
113
+ size: Optional[dict[str, int]] = None,
114
+ min_pixels: Optional[int] = None,
115
+ max_pixels: Optional[int] = None,
116
+ resample: Optional[PILImageResampling] = None,
117
+ do_rescale: Optional[bool] = None,
118
+ rescale_factor: Optional[float] = None,
119
+ do_normalize: Optional[bool] = None,
120
+ image_mean: Optional[Union[float, list[float]]] = None,
121
+ image_std: Optional[Union[float, list[float]]] = None,
122
+ patch_size: Optional[int] = None,
123
+ temporal_patch_size: Optional[int] = None,
124
+ merge_size: Optional[int] = None,
125
+ do_convert_rgb: Optional[bool] = None,
126
+ return_tensors: Optional[str] = None,
127
+ **kwargs,
128
+ ) -> BatchFeature:
129
+ min_pixels = min_pixels if min_pixels is not None else self.min_pixels
130
+ max_pixels = max_pixels if max_pixels is not None else self.max_pixels
131
+
132
+ if size is not None:
133
+ if "shortest_edge" not in size or "longest_edge" not in size:
134
+ raise ValueError(
135
+ "size must contain 'shortest_edge' and 'longest_edge' keys."
136
+ )
137
+ elif min_pixels is not None and max_pixels is not None:
138
+ # backward compatibility: override size with min_pixels and max_pixels if they are provided
139
+ size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
140
+
141
+ do_resize = do_resize if do_resize is not None else self.do_resize
142
+ resample = resample if resample is not None else self.resample
143
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
144
+ rescale_factor = (
145
+ rescale_factor if rescale_factor is not None else self.rescale_factor
146
+ )
147
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
148
+ image_mean = image_mean if image_mean is not None else self.image_mean
149
+ image_std = image_std if image_std is not None else self.image_std
150
+ patch_size = patch_size if patch_size is not None else self.patch_size
151
+ temporal_patch_size = (
152
+ temporal_patch_size
153
+ if temporal_patch_size is not None
154
+ else self.temporal_patch_size
155
+ )
156
+ merge_size = merge_size if merge_size is not None else self.merge_size
157
+ do_convert_rgb = (
158
+ do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
159
+ )
160
+
161
+ if images is not None:
162
+ images = make_flat_list_of_images(images)
163
+
164
+ if images is not None and not valid_images(images):
165
+ raise ValueError(
166
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
167
+ "torch.Tensor, tf.Tensor or jax.ndarray."
168
+ )
169
+
170
+ if do_convert_rgb:
171
+ images = [convert_to_rgb(image) for image in images]
172
+
173
+ data = {}
174
+ pixel_values, vision_grid_thws = [], []
175
+ if images is not None:
176
+ processed_images = []
177
+ for image in images:
178
+ width, height = image.size
179
+ resized_height, resized_width = smart_resize(
180
+ height,
181
+ width,
182
+ factor=patch_size * merge_size,
183
+ min_pixels=min_pixels,
184
+ max_pixels=max_pixels,
185
+ )
186
+ image = image.resize((resized_width, resized_height), resample)
187
+ img_array = to_numpy_array(image)
188
+
189
+ if do_rescale:
190
+ img_array = img_array / 255.0
191
+
192
+ if do_normalize:
193
+ mean = np.array(self.image_mean).reshape(1, 1, 3)
194
+ std = np.array(self.image_std).reshape(1, 1, 3)
195
+ img_array = (img_array - mean) / std
196
+
197
+ processed_images.append(img_array)
198
+
199
+ patches = np.array(processed_images)
200
+
201
+ if patches.shape[1] > 3:
202
+ patches = patches.transpose(0, 3, 1, 2)
203
+ if patches.shape[0] == 1:
204
+ patches = np.tile(patches, (temporal_patch_size, 1, 1, 1))
205
+
206
+ channel = patches.shape[1]
207
+ grid_t = patches.shape[0] // temporal_patch_size
208
+ grid_h, grid_w = (
209
+ resized_height // patch_size,
210
+ resized_width // patch_size,
211
+ )
212
+ patches = patches.reshape(
213
+ grid_t,
214
+ temporal_patch_size,
215
+ channel,
216
+ grid_h,
217
+ patch_size,
218
+ grid_w,
219
+ patch_size,
220
+ )
221
+ patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
222
+ if temporal_patch_size != 1:
223
+ raise ValueError(
224
+ f"temporal_patch_size must be 1!, but got {temporal_patch_size}!"
225
+ )
226
+ flatten_patches = patches.reshape(
227
+ grid_t * grid_h * grid_w, channel, patch_size, patch_size
228
+ )
229
+ image_grid_thw = (grid_t, grid_h, grid_w)
230
+ pixel_values.extend(flatten_patches)
231
+ vision_grid_thws.append(image_grid_thw)
232
+
233
+ pixel_values = np.array([pixel_values])
234
+ vision_grid_thws = np.array(vision_grid_thws)
235
+ data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
236
+
237
+ return BatchFeature(data, tensor_type=return_tensors)
238
+
239
+
240
+ class PaddleOCRVLProcessor(ProcessorMixin):
241
+ attributes = ["image_processor", "tokenizer"]
242
+ valid_kwargs = ["chat_template"]
243
+ image_processor_class = "AutoImageProcessor"
244
+ tokenizer_class = "AutoTokenizer"
245
+
246
+ def __init__(
247
+ self,
248
+ image_processor=None,
249
+ tokenizer=None,
250
+ chat_template=None,
251
+ **kwargs,
252
+ ):
253
+
254
+ if image_processor is None:
255
+ image_processor = ImageProcessor(**kwargs)
256
+
257
+ self.tokenizer = tokenizer
258
+ self.image_token = (
259
+ "<|IMAGE_PLACEHOLDER|>"
260
+ if not hasattr(tokenizer, "image_token")
261
+ else tokenizer.image_token
262
+ )
263
+ self.image_processor = image_processor
264
+
265
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
266
+
267
+ def __call__(
268
+ self,
269
+ images=None,
270
+ text: Union[str, List[str]] = None,
271
+ **kwargs,
272
+ ) -> BatchFeature:
273
+ """Process images and text for the model.
274
+
275
+ Args:
276
+ images: Single image or list of images
277
+ text: Single text or list of texts
278
+ videos: Video inputs (not currently supported)
279
+ **kwargs: Additional arguments passed to tokenizer
280
+
281
+ Returns:
282
+ BatchFeature with:
283
+ - input_ids: Token IDs with image placeholders replaced
284
+ - attention_mask: Attention mask
285
+ - pixel_values: Processed image patches
286
+ - image_grid_thw: Grid dimensions for each image
287
+ - position_ids: 4D position IDs for xdrope
288
+ """
289
+ image_inputs = {}
290
+
291
+ if images is not None:
292
+ image_inputs = self.image_processor(images=images)
293
+ image_grid_thw = image_inputs["image_grid_thw"]
294
+
295
+ if text is None:
296
+ text = [""]
297
+ elif not isinstance(text, list):
298
+ text = [text]
299
+
300
+ text = [t for t in text] # Copy to avoid modifying original
301
+
302
+ if images is not None:
303
+ index = 0
304
+ for i in range(len(text)):
305
+ while self.image_token in text[i]:
306
+ text[i] = text[i].replace(
307
+ self.image_token,
308
+ "<|placeholder|>"
309
+ * (
310
+ image_grid_thw[index].prod()
311
+ // self.image_processor.merge_size
312
+ // self.image_processor.merge_size
313
+ ),
314
+ 1,
315
+ )
316
+ index += 1
317
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
318
+
319
+ # Pop return_tensors to handle it ourselves at the end
320
+ return_tensors = kwargs.pop("return_tensors", None)
321
+
322
+ # Tokenize text
323
+ text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
324
+
325
+ # Get input_ids and convert to numpy array for processing
326
+ input_ids = text_inputs["input_ids"]
327
+ if hasattr(input_ids, "tolist"):
328
+ # Handle mlx arrays or torch tensors
329
+ input_ids = np.array(input_ids.tolist())
330
+ elif isinstance(input_ids, list):
331
+ input_ids = np.array(input_ids)
332
+
333
+ return BatchFeature(
334
+ data={**text_inputs, **image_inputs},
335
+ tensor_type=return_tensors,
336
+ )
337
+
338
+ def batch_decode(self, *args, **kwargs):
339
+ """Decode token IDs to text."""
340
+ return self.tokenizer.batch_decode(*args, **kwargs)
341
+
342
+ def decode(self, *args, **kwargs):
343
+ """Decode token IDs to text."""
344
+ return self.tokenizer.decode(*args, **kwargs)
345
+
346
+ def apply_chat_template(self, *args, **kwargs):
347
+ """Apply chat template using the tokenizer."""
348
+ return self.tokenizer.apply_chat_template(*args, **kwargs)
349
+
350
+ @property
351
+ def model_input_names(self):
352
+ """Return combined input names from tokenizer and image processor."""
353
+ tokenizer_input_names = (
354
+ self.tokenizer.model_input_names if self.tokenizer else []
355
+ )
356
+ image_processor_input_names = self.image_processor.model_input_names
357
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
358
+
359
+ @classmethod
360
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
361
+ """Load processor from pretrained model path."""
362
+ import warnings
363
+
364
+ from huggingface_hub import hf_hub_download
365
+
366
+ trust_remote_code = kwargs.pop("trust_remote_code", True)
367
+
368
+ model_path = Path(pretrained_model_name_or_path)
369
+ is_local = model_path.exists() and model_path.is_dir()
370
+
371
+ # Suppress warning about mrope_section in rope_parameters
372
+ with warnings.catch_warnings():
373
+ warnings.filterwarnings(
374
+ "ignore", message="Unrecognized keys in `rope_parameters`"
375
+ )
376
+ tokenizer = AutoTokenizer.from_pretrained(
377
+ str(model_path) if is_local else pretrained_model_name_or_path,
378
+ trust_remote_code=trust_remote_code,
379
+ local_files_only=is_local,
380
+ **kwargs,
381
+ )
382
+
383
+ # Load image processor config from preprocessor_config.json
384
+ image_processor_config = {}
385
+ try:
386
+ if is_local:
387
+ config_path = model_path / "preprocessor_config.json"
388
+ else:
389
+ config_path = Path(
390
+ hf_hub_download(
391
+ pretrained_model_name_or_path, "preprocessor_config.json"
392
+ )
393
+ )
394
+ if config_path.exists():
395
+ with open(config_path, "r", encoding="utf-8") as f:
396
+ preprocessor_config = json.load(f)
397
+ # Extract relevant image processor parameters
398
+ relevant_keys = [
399
+ "min_pixels",
400
+ "max_pixels",
401
+ "patch_size",
402
+ "temporal_patch_size",
403
+ "merge_size",
404
+ "image_mean",
405
+ "image_std",
406
+ "do_resize",
407
+ "do_rescale",
408
+ "do_normalize",
409
+ "do_convert_rgb",
410
+ ]
411
+ for key in relevant_keys:
412
+ if key in preprocessor_config:
413
+ image_processor_config[key] = preprocessor_config[key]
414
+
415
+ except Exception:
416
+ pass
417
+
418
+ image_processor = ImageProcessor(**image_processor_config)
419
+ return cls(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
420
+
421
+
422
+ __all__ = [
423
+ "PaddleOCRVLProcessor",
424
+ "ImageProcessor",
425
+ ]
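
Note: with the ImageProcessor defaults above (patch_size=14, merge_size=2), smart_resize snaps each image to multiples of 28 while keeping the pixel count inside [min_pixels, max_pixels]; the resulting patch grid then fixes how many image placeholder tokens the processor inserts. A condensed re-run of the main branch of that arithmetic with an example size (the small-dimension and aspect-ratio guards are omitted; snapped_size is an illustrative helper, not part of the package):

import math

def snapped_size(height, width, factor=28, min_pixels=147384, max_pixels=2822400):
    # Round both sides to the nearest multiple of `factor`, then rescale if the
    # area falls outside the configured pixel budget (same math as smart_resize).
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar

h, w = snapped_size(1000, 800)          # -> (1008, 812)
grid_h, grid_w = h // 14, w // 14       # -> (72, 58) patches per side
placeholders = (grid_h * grid_w) // 4   # merge_size**2 = 4 -> 1044 image tokens
print(h, w, grid_h, grid_w, placeholders)
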