PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/glm_image/processing_glm_image.py CHANGED Viewed

@@ -72,6 +72,8 @@ class GlmImageProcessor(ProcessorMixin):
             in a chat into a tokenizable string.
     """
+    model_input_names = ["input_ids", "attention_mask", "pixel_values", "image_grid_thw", "images_per_sample"]
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         self.image_token = tokenizer.image_token
         self.grid_bos_token = tokenizer.grid_bos_token
@@ -119,6 +121,7 @@ class GlmImageProcessor(ProcessorMixin):
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
         target_h = output_kwargs["images_kwargs"].pop("target_h", None)
         target_w = output_kwargs["images_kwargs"].pop("target_w", None)
         is_text_to_image = images is None
@@ -130,16 +133,27 @@ class GlmImageProcessor(ProcessorMixin):
             image_inputs = {}
             image_grid_thw = None
+        # Handle text=None case (image-only processing)
+        if text is None:
+            if images is None:
+                raise ValueError("You must provide at least one of `text` or `images`.")
+            return image_inputs
         if not isinstance(text, list):
             text = [text]
-        if len(text) > 1:
-            raise ValueError("The model does not support batch size > 1")
+        batch_size = len(text)
         text = text.copy()  # below lines change text in-place
+        # Count images per sample by counting image tokens in each text
+        images_per_sample = []
+        for i in range(batch_size):
+            images_per_sample.append(text[i].count(self.image_token))
+        # Replace image tokens with the correct number of placeholder tokens
         if not is_text_to_image:
             index = 0
-            for i in range(len(text)):
+            for i in range(batch_size):
                 while self.image_token in text[i]:
                     grid = image_grid_thw[index]
                     num_image_tokens = int(grid[1] * grid[2])
@@ -147,20 +161,50 @@ class GlmImageProcessor(ProcessorMixin):
                     index += 1
                 text[i] = text[i].replace("<|placeholder|>", self.image_token)
-        text[0], token_h, token_w, prev_h, prev_w = self._build_prompt_with_target_shape(
-            text[0], height=target_h, width=target_w, is_text_to_image=is_text_to_image
-        )
-        image_inputs["image_grid_thw"] = self._build_target_image_grid_thw(
-            token_h=token_h,
-            token_w=token_w,
-            prev_token_h=prev_h,
-            prev_token_w=prev_w,
-            image_grid_thw=image_grid_thw if not is_text_to_image else None,
+        # Build prompt with target shape and combine grids in a single loop
+        # Format: [sample0_source_grids..., sample0_target_grids, sample1_source_grids..., sample1_target_grids, ...]
+        # Note: In i2i mode, batches are homogeneous (same number of source images per sample)
+        num_source_images = images_per_sample[0] if images_per_sample else 0
+        # Validate homogeneity for i2i mode
+        if not is_text_to_image and images_per_sample and len(set(images_per_sample)) != 1:
+            raise ValueError(
+                f"In image-to-image mode, all samples must have the same number of source images. "
+                f"Got different counts: {images_per_sample}"
+            )
+        all_grids = []
+        for i in range(batch_size):
+            text[i], token_h, token_w, prev_h, prev_w = self._build_prompt_with_target_shape(
+                text[i], height=target_h, width=target_w, is_text_to_image=is_text_to_image
+            )
+            # Add source grids for this sample (i2i mode only)
+            if not is_text_to_image and num_source_images > 0:
+                start_idx = i * num_source_images
+                all_grids.append(image_grid_thw[start_idx : start_idx + num_source_images])
+            # Add target grid for this sample
+            all_grids.append(
+                self._build_target_image_grid_thw(
+                    token_h=token_h,
+                    token_w=token_w,
+                    prev_token_h=prev_h,
+                    prev_token_w=prev_w,
+                    is_text_to_image=is_text_to_image,
+                )
+            )
+        image_inputs["image_grid_thw"] = torch.cat(all_grids, dim=0)
+        # Store images_per_sample for later use (add target images count)
+        # Each sample will have: source_images + target_images (typically 2 for t2i, 1 for i2i)
+        num_target_grids = 2 if is_text_to_image else 1
+        image_inputs["images_per_sample"] = torch.tensor(
+            [num_source_images + num_target_grids] * batch_size, dtype=torch.long
         )
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
         self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
         if return_mm_token_type_ids:
@@ -199,9 +243,10 @@ class GlmImageProcessor(ProcessorMixin):
         token_w: int,
         prev_token_h: int,
         prev_token_w: int,
-        image_grid_thw: None,
+        is_text_to_image: bool = True,
     ):
-        if image_grid_thw is None:
+        if is_text_to_image:
+            # Text-to-image: 2 target grids (large + small preview)
             return torch.tensor(
                 [
                     [1, token_h, token_w],
@@ -209,8 +254,11 @@ class GlmImageProcessor(ProcessorMixin):
                 ],
             )
         else:
-            return torch.cat(
-                [image_grid_thw, torch.tensor([[1, token_h, token_w]], device=image_grid_thw.device)], dim=0
+            # Image-to-image: 1 target grid only
+            return torch.tensor(
+                [
+                    [1, token_h, token_w],
+                ],
             )

transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Inc. team.
+# Copyright 2026 the HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,4 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+if TYPE_CHECKING:
+    from .configuration_glm_ocr import *
+    from .modeling_glm_ocr import *
+else:
+    import sys
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

transformers/models/glm_ocr/configuration_glm_ocr.py ADDED Viewed

@@ -0,0 +1,312 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm_ocr/modular_glm_ocr.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm_ocr.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+class GlmOcrVisionConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmOcrVisionConfig`]. It is used to instantiate a
+    GLM-OCR model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-OCR [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR).
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+    Args:
+        depth (`int`, *optional*, defaults to 24):
+            Number of layers (depth) in the model.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"silu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for attention weights.
+        num_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer architecture.
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of input channels.
+        image_size (`int` or `list[int]`, *optional*, defaults to 336):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        spatial_merge_size (`int`, *optional*, defaults to 2):
+            The size used for merging spatial dimensions.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The size used for patches along the temporal dimension.
+        out_hidden_size (`int`, *optional*, defaults to 1536):
+            The output hidden size of the vision model.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    """
+    model_type = "glm_ocr_vision"
+    base_config_key = "vision_config"
+    def __init__(
+        self,
+        depth=24,
+        hidden_size=1024,
+        hidden_act="silu",
+        attention_bias=True,
+        attention_dropout=0.0,
+        num_heads=16,
+        in_channels=3,
+        image_size=336,
+        patch_size=14,
+        rms_norm_eps=1e-05,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=1536,
+        intermediate_size=4096,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.intermediate_size = intermediate_size
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+class GlmOcrTextConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmOcrTextConfig`]. It is used to instantiate a
+    GLM-OCR model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-OCR [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR).
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 59392):
+            Vocabulary size of the GlmOcr model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GlmOcrModel`].
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 16):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If set to `None`, it falls back to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+    ```python
+    >>> from transformers import GlmOcrTextModel, GlmOcrConfig
+    >>> # Initializing a GLM-OCR style configuration
+    >>> configuration = GlmOcrConfig()
+    >>> # Initializing a model from the GLM-OCR style configuration
+    >>> model = GlmOcrTextModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "glm_ocr_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `GlmOcr`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_up_proj": "colwise_gather_output",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_split_input",  # input is replicated due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        vocab_size: int | None = 59392,
+        hidden_size: int | None = 1024,
+        intermediate_size: int | None = 4096,
+        num_hidden_layers: int | None = 16,
+        num_attention_heads: int | None = 16,
+        num_key_value_heads: int | None = 8,
+        hidden_act: str | None = "silu",
+        max_position_embeddings: int | None = 131072,
+        initializer_range: float | None = 0.02,
+        rms_norm_eps: int | None = 1e-05,
+        use_cache: bool | None = True,
+        attention_dropout: float | None = 0.0,
+        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
+        pad_token_id: int | None = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_dropout = attention_dropout
+        self.rope_parameters = rope_parameters
+        self.pad_token_id = pad_token_id
+        super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs)
+class GlmOcrConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmOcrModel`]. It is used to instantiate a
+    GLM-OCR model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-OCR [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR).
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmOcrTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `GlmOcrVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 59280):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 59281):
+            The video token index to encode the video prompt.
+        image_start_token_id (`int`, *optional*, defaults to 59256):
+            The image start token index to encode the start of image.
+        image_end_token_id (`int`, *optional*, defaults to 59257):
+            The image end token index to encode the end of image.
+        video_start_token_id (`int`, *optional*, defaults to 59258):
+            The video start token index to encode the start of video.
+        video_end_token_id (`int`, *optional*, defaults to 59259):
+            The video end token index to encode the end of video.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+    ```python
+    >>> from transformers import GlmOcrForConditionalGeneration, GlmOcrConfig
+    >>> # Initializing a GLM-OCR style configuration
+    >>> configuration = GlmOcrConfig()
+    >>> # Initializing a model from the GLM-OCR style configuration
+    >>> model = GlmOcrForConditionalGeneration(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "glm_ocr"
+    sub_configs = {"vision_config": GlmOcrVisionConfig, "text_config": GlmOcrTextConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=59280,
+        video_token_id=59281,
+        image_start_token_id=59256,
+        image_end_token_id=59257,
+        video_start_token_id=59258,
+        video_end_token_id=59259,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.video_start_token_id = video_start_token_id
+        self.video_end_token_id = video_end_token_id
+        self.image_start_token_id = image_start_token_id
+        self.image_end_token_id = image_end_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(**kwargs)
+__all__ = ["GlmOcrConfig", "GlmOcrTextConfig", "GlmOcrVisionConfig"]

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl