transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
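A comparison like this can be reproduced locally from the published wheels. Below is a minimal sketch, assuming both versions are available on PyPI and that the local environment has `pip`; the `old`/`new` directory names and the `fetch` helper are illustrative choices, not part of any registry tooling. It downloads each wheel without dependencies and reports which files were added or removed (new modules such as `eomt_dinov3` or `glm_ocr` in the listing below show up as added files).

```python
import pathlib
import subprocess
import sys
import zipfile

def fetch(version: str, dest: str) -> pathlib.Path:
    """Download the transformers wheel for `version` into `dest` and return its path."""
    pathlib.Path(dest).mkdir(exist_ok=True)
    subprocess.run(
        [sys.executable, "-m", "pip", "download", f"transformers=={version}",
         "--no-deps", "-d", dest],
        check=True,
    )
    return next(pathlib.Path(dest).glob("*.whl"))

old_whl = fetch("5.0.0rc3", "old")  # exact pre-release pins need no --pre flag
new_whl = fetch("5.1.0", "new")

# Wheels are zip archives, so the file inventories can be compared directly.
with zipfile.ZipFile(old_whl) as old, zipfile.ZipFile(new_whl) as new:
    old_names, new_names = set(old.namelist()), set(new.namelist())

print("files added:  ", len(new_names - old_names))
print("files removed:", len(old_names - new_names))
for name in sorted(new_names - old_names)[:5]:
    print("  +", name)
```

Per-file +/- line counts like those in the listing would additionally require extracting both archives and diffing matching files (for example with `difflib.unified_diff`).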
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -32,7 +32,7 @@ from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
@@ -126,9 +126,9 @@ class GlmImageVisionAttention(nn.Module):
         key_states = key_states.transpose(0, 1).unsqueeze(0)
         value_states = value_states.transpose(0, 1).unsqueeze(0)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         if "flash" in self.config._attn_implementation:
             # Flash Attention: Use cu_seqlens for variable length attention
@@ -402,9 +402,9 @@ class GlmImageTextAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
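
Aside: both attention classes above swap the explicit eager-vs-registry branch for a single lookup with a fallback. A minimal sketch of that dispatch pattern; the registry class and `get_interface` helper here are simplified stand-ins inferred from the diff, not the actual `transformers` internals:

```python
from typing import Callable

# Hypothetical stand-in for ALL_ATTENTION_FUNCTIONS: a dict-like registry
# whose get_interface() falls back to the eager implementation.
class AttentionRegistry(dict):
    def get_interface(self, name: str, default: Callable) -> Callable:
        # "eager" (or any unregistered key) resolves to the default callable
        return self.get(name, default)

def eager_attention(q, k, v):
    return "eager"

def sdpa_attention(q, k, v):
    return "sdpa"

REGISTRY = AttentionRegistry(sdpa=sdpa_attention)

# Same shape as the diff: one lookup, eager as the fallback.
assert REGISTRY.get_interface("sdpa", eager_attention)(0, 0, 0) == "sdpa"
assert REGISTRY.get_interface("eager", eager_attention)(0, 0, 0) == "eager"
```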
@@ -612,6 +612,23 @@ class GlmImageVQVAEVectorQuantizer(nn.Module):
         return hidden_state_quant, loss, min_encoding_indices


+@dataclass
+@auto_docstring
+class GlmImageVQVAEModelOutput(BaseModelOutputWithPooling):
+    r"""
+    quantized_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+        Quantized last hidden state from the VQ-VAE model.
+    image_tokens (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
+        Indices of the image tokens predicted by the VQ-VAE model.
+    embedding_loss (`torch.FloatTensor`):
+        The embedding loss computed during quantization.
+    """
+
+    quantized_last_hidden_state: torch.FloatTensor | None = None
+    image_tokens: torch.FloatTensor | None = None
+    embedding_loss: torch.FloatTensor | None = None
+
+
 @auto_docstring(
     custom_intro="""
     The VQ-VAE model used in GlmImage for encoding/decoding images into discrete tokens.
@@ -625,6 +642,7 @@ class GlmImageVQVAE(GlmImagePreTrainedModel):
     _no_split_modules = [
         "GlmImageVQVAEVectorQuantizer",
     ]
+    _can_record_outputs = {}

     def __init__(self, config: GlmImageVQVAEConfig):
         super().__init__(config)
@@ -634,16 +652,26 @@ class GlmImageVQVAE(GlmImagePreTrainedModel):
         self.eval()  # GlmImage's VQ model is frozen
         self.post_init()

-    def encode(self, hidden_states):
-        hidden_states = self.quant_conv(hidden_states)
-        quant, emb_loss, indices = self.quantize(hidden_states)
-        return quant, emb_loss, indices
+    @check_model_inputs
+    def encode(self, hidden_states) -> GlmImageVQVAEModelOutput:
+        conv_hidden_states = self.quant_conv(hidden_states)
+        quantized_last_hidden_state, emb_loss, indices = self.quantize(conv_hidden_states)
+        return GlmImageVQVAEModelOutput(
+            last_hidden_state=hidden_states,
+            quantized_last_hidden_state=quantized_last_hidden_state,
+            image_tokens=indices,
+            embedding_loss=emb_loss,
+        )


 class GlmImageVisionModel(GlmImagePreTrainedModel):
     config: GlmImageVisionConfig
     input_modalities = ("image",)
     _no_split_modules = ["GlmImageVisionBlock"]
+    _can_record_outputs = {
+        "hidden_states": GlmImageVisionBlock,
+        "attentions": GlmImageVisionAttention,
+    }
     main_input_name = "pixel_values"

     def __init__(self, config: GlmImageVisionConfig) -> None:
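
Aside: `encode` now returns a typed output object instead of a bare 3-tuple, so call sites read named fields rather than positional slots. A minimal sketch of the pattern with a plain dataclass; this stand-in only illustrates the named-field access, the real class inherits from `BaseModelOutputWithPooling`:

```python
from dataclasses import dataclass

@dataclass
class VQVAEOutput:
    # Hypothetical simplified stand-in for GlmImageVQVAEModelOutput
    last_hidden_state: list | None = None
    quantized_last_hidden_state: list | None = None
    image_tokens: list | None = None
    embedding_loss: float | None = None

# Old style: positional unpacking, order-sensitive and easy to get wrong
#   quant, emb_loss, indices = model.encode(hs)
# New style: named access, order-independent
out = VQVAEOutput(image_tokens=[3, 1, 4], embedding_loss=0.02)
print(out.image_tokens, out.embedding_loss)  # [3, 1, 4] 0.02
```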
@@ -688,13 +716,16 @@ class GlmImageVisionModel(GlmImagePreTrainedModel):
         pos_ids = torch.cat(pos_ids, dim=0)
         return pos_ids

-    def forward(self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
-        """
-        Args:
-            pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`):
-                Packed pixel values.
-            grid_thw (`torch.Tensor` of shape `(num_images, 3)`):
-                The temporal, height and width of feature shape of each image.
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`):
+            Packed pixel values.
+        grid_thw (`torch.Tensor` of shape `(num_images, 3)`):
+            The temporal, height and width of feature shape of each image.

         Returns:
             `torch.Tensor` of shape `(total_patches, hidden_size)`: Hidden states.
@@ -723,7 +754,8 @@ class GlmImageVisionModel(GlmImagePreTrainedModel):
                 hidden_states,
                 cu_seqlens=cu_seqlens,
             )
-        return hidden_states
+
+        return BaseModelOutputWithPooling(last_hidden_state=hidden_states)


 class GlmImageTextRotaryEmbedding(nn.Module):
@@ -927,6 +959,10 @@ class GlmImageModel(GlmImagePreTrainedModel):
         self.rope_deltas = None  # cache rope_deltas here
         self.vqmodel = GlmImageVQVAE._from_config(config.vq_config)

+        # Per-sample caches for batch processing
+        self._cached_decode_position_ids = None  # shape: [batch_size, 3, max_decode_len]
+        self._prefill_len = None  # prefill sequence length (same for all samples in batch)
+
         # Initialize weights and apply final processing
         self.post_init()

@@ -940,220 +976,169 @@ class GlmImageModel(GlmImagePreTrainedModel):
         self,
         input_ids: torch.LongTensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
+        images_per_sample: torch.LongTensor | None = None,
         attention_mask: torch.LongTensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
-        Calculate the 3D rope index for image generation task.
-
-        Explanation:
-            Each embedding sequence may contain image tokens (for generation) and text tokens,
-            or just text tokens.
-
-            Input format:
-            - Text-to-Image: [text tokens] + <|dit_token_16384|>
-            - Image-to-Image: <|dit_token_16384|> [image tokens] <|dit_token_16385|> + [text tokens] + <|dit_token_16384|>
-
-            For pure text embedding sequence, the rotary position embedding is the same across all 3 dimensions.
-            Examples:
-                input_ids: [T T T T T], here T is for text.
-                temporal position_ids: [0, 1, 2, 3, 4]
-                height position_ids: [0, 1, 2, 3, 4]
-                width position_ids: [0, 1, 2, 3, 4]
-
-            For sequences with image tokens, we use special markers to denote image regions:
-            - <|dit_token_16384|>: image start marker
-            - <|dit_token_16385|>: image end marker
-            - Image tokens between these markers use 2D spatial position encoding.
-
-            For image tokens:
-            - temporal: stays constant at (image_start_pos + 1)
-            - height: increments every w tokens, representing row position
-            - width: cycles from 0 to w-1, representing column position
-
-            After each image region, the next position jumps to: image_start_pos + 1 + max(h, w)
-            This ensures sufficient positional separation between images and subsequent tokens.
-
-            Examples:
-                === Case 1: Image-to-Image Generation ===
-
-                Source image with grid [1, 3, 2], followed by text, then generation.
-                input_ids: [<|dit_token_16384|> V V V V V V <|dit_token_16385|> T T T T <|dit_token_16384|>]
-                image_grid_thw: [[1, 3, 2], [1, 4, 4]]  # first is source, second is target
-
-                For source image (h=3, w=2, 6 tokens):
-                    Start marker at position 0
-                    Image tokens at temporal=1, height=[1,1,2,2,3,3], width=[1,2,1,2,1,2]
-                    End marker at position 4 (= 0 + 1 + max(3,2))
-
-                Text tokens and trailing start marker continue from position 5.
-
-                Full prefill position_ids:
-                    temporal: [0, 1,1,1,1,1,1, 4, 5,6,7,8, 9]
-                    height:   [0, 1,1,2,2,3,3, 4, 5,6,7,8, 9]
-                    width:    [0, 1,2,1,2,1,2, 4, 5,6,7,8, 9]
-
-                Decode stage: use image_grid_thw[-1] = [1, 4, 4] to build cached position_ids,
-                starting from gen_st_idx = 10.
-
-                === Case 2: Text-to-Image Generation (multi-resolution) ===
-
-                Pure text input with two image_grids for progressive generation.
-                input_ids: [hello<sop>3 3<eop><sop>3 2<eop><|dit_token_16384|>]
-                Assume "hello<sop>3 3<eop><sop>3 2<eop>" = 4 tokens (positions 0-3)
-                <|dit_token_16384|> at position 4
-                image_grid_thw: [[1, 3, 3], [1, 3, 2]]
-                - image_grid_thw[-1] = [1, 3, 2]: first generated image (smaller/draft)
-                - image_grid_thw[-2] = [1, 3, 3]: second generated image (larger/final)
-
-                Prefill position_ids (5 tokens: 4 text + 1 start marker):
-                    temporal: [0, 1, 2, 3, 4]
-                    height:   [0, 1, 2, 3, 4]
-                    width:    [0, 1, 2, 3, 4]
-
-                Decode stage builds position_ids in reverse order of image_grid_thw:
-
-                First: image_grid_thw[-1] = [1, 3, 2] (6 tokens), starting at position 5:
-                    temporal: [5, 5, 5, 5, 5, 5]
-                    height:   [5, 5, 6, 6, 7, 7]
-                    width:    [5, 6, 5, 6, 5, 6]
-                    next_pos = 5 + max(3, 2) = 8
-
-                Then: image_grid_thw[-2] = [1, 3, 3] (9 tokens), starting at position 8:
-                    temporal: [8, 8, 8, 8, 8, 8, 8, 8, 8]
-                    height:   [8, 8, 8, 9, 9, 9, 10, 10, 10]
-                    width:    [8, 9, 10, 8, 9, 10, 8, 9, 10]
-                    next_pos = 8 + max(3, 3) = 11
-
-                Finally: <|dit_token_16385|> end marker at position 11
-
-                Full sequence position_ids (prefill + decode):
-                    temporal: [0,1,2,3, 4, 5,5,5,5,5,5, 8,8,8,8,8,8,8,8,8, 11]
-                    height:   [0,1,2,3, 4, 5,5,6,6,7,7, 8,8,8,9,9,9,10,10,10, 11]
-                    width:    [0,1,2,3, 4, 5,6,5,6,5,6, 8,9,10,8,9,10,8,9,10, 11]
-
-                _cached_decode_position_ids shape: [3, 6 + 9 + 1] = [3, 16]
-                (includes all generated image tokens + end marker)
+        Calculate the 3D rope index for image generation task with full batch support.

         Args:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default
-                should you provide it.
-            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
-                The temporal, height and width of feature shape of each image. For image generation,
-                temporal is typically 1.
-                - For image-to-image: includes source image grids + target image grid(s)
-                - For text-to-image with multi-resolution: includes multiple target grids,
-                  processed in reverse order (last grid first, second-to-last grid second, etc.)
+                Indices of input sequence tokens in the vocabulary.
+            image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image.
+                Images are packed across all samples in the batch.
+            images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Number of images (including target grids) for each sample in the batch.
+                Used to split image_grid_thw by sample.
             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
+                Mask to avoid performing attention on padding token indices.

         Returns:
             position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`):
                 Position IDs for temporal, height, and width dimensions.
             mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`):
-                Position deltas for multi-modal rotary position embedding (zeros for this task).
+                Position deltas for multi-modal rotary position embedding.
         """
-
         batch_size, seq_len = input_ids.shape
         device = input_ids.device
         dtype = input_ids.dtype

         image_start_token_id = self.config.image_start_token_id
         image_end_token_id = self.config.image_end_token_id
-        num_complete_images = (input_ids == image_end_token_id).sum().item()

-        position_ids = torch.ones(
-            3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device
-        )
-        text_positions = torch.arange(seq_len)[None, :].repeat(3, 1)
+        position_ids = torch.ones(3, batch_size, seq_len, dtype=dtype, device=device)
+        text_positions = torch.arange(seq_len, device=device)[None, :].repeat(3, 1)
+
+        # Split image_grid_thw by sample if images_per_sample is provided
+        if image_grid_thw is not None and images_per_sample is not None:
+            grids_per_sample = torch.split(image_grid_thw, images_per_sample.tolist())
+        elif image_grid_thw is not None:
+            # Fallback: assume all grids belong to first sample (batch_size=1)
+            grids_per_sample = [image_grid_thw] * batch_size
+        else:
+            grids_per_sample = [None] * batch_size
+
+        # Per-sample caches for decode stage
+        all_decode_position_ids = []
+
         for batch_idx in range(batch_size):
             curr_input_ids = input_ids[batch_idx]
-            if attention_mask is not None:
-                curr_input_ids = curr_input_ids[attention_mask[batch_idx] == 1]
+            curr_grids = grids_per_sample[batch_idx]

-            image_end = torch.where(curr_input_ids == image_end_token_id)[0]
-            image_start = torch.where(curr_input_ids == image_start_token_id)[0] + 1
-            current_pos = 0  # track the current position value
+            if attention_mask is not None and attention_mask.shape[1] == seq_len:
+                valid_mask = attention_mask[batch_idx] == 1
+                curr_input_ids_valid = curr_input_ids[valid_mask]
+            else:
+                # attention_mask may have different length during assisted decoding
+                curr_input_ids_valid = curr_input_ids
+                valid_mask = None
+
+            # Find image boundaries in this sample
+            image_end_positions = torch.where(curr_input_ids_valid == image_end_token_id)[0]
+            image_start_positions = torch.where(curr_input_ids_valid == image_start_token_id)[0] + 1
+            num_complete_images = len(image_end_positions)
+
+            current_pos = 0
             prev_image_end = 0
             curr_position_ids = []
-            for start, end, grid in zip(image_start, image_end, image_grid_thw):
-                _, num_width_grid, num_height_grid = grid

-                # Create text position ids first if there are text tokens before image
+            # Process complete images (source images in image-to-image task)
+            for img_idx, (start, end) in enumerate(zip(image_start_positions, image_end_positions)):
+                if curr_grids is None or img_idx >= len(curr_grids):
+                    break
+                grid = curr_grids[img_idx]
+                # grid format is [temporal, height, width]
+                _, height, width = grid.tolist()
+
+                # Text tokens before this image
                 llm_pos_length = start - prev_image_end
-                llm_position_ids = text_positions[:, current_pos : current_pos + llm_pos_length].to(
-                    device=input_ids.device
-                )
+                llm_position_ids = text_positions[:, current_pos : current_pos + llm_pos_length].to(device=device)
                 current_pos += llm_position_ids.shape[-1]

-                # Now create image position ids for each grid
-                image_seq_length = num_height_grid * num_width_grid
-                h_grids = image_seq_length // num_height_grid + current_pos
-                w_grids = image_seq_length // num_width_grid + current_pos
-                position_width = torch.arange(current_pos, w_grids, device=input_ids.device).repeat(num_width_grid)
-                position_height = torch.arange(current_pos, h_grids, device=input_ids.device).repeat_interleave(
-                    num_height_grid
-                )
-                position_temporal = torch.full(
-                    (image_seq_length,), current_pos, device=input_ids.device, dtype=torch.long
+                # Image tokens with 2D spatial encoding
+                # For an image with height H and width W:
+                #   - position_width cycles [0, 1, ..., W-1] for each row, repeated H times
+                #   - position_height stays constant per row, [0]*W, [1]*W, ..., [H-1]*W
+                image_seq_length = height * width
+                position_width = torch.arange(current_pos, current_pos + width, device=device).repeat(height)
+                position_height = torch.arange(current_pos, current_pos + height, device=device).repeat_interleave(
+                    width
                 )
+                position_temporal = torch.full((image_seq_length,), current_pos, device=device, dtype=torch.long)
                 vision_position_ids = torch.stack([position_temporal, position_height, position_width], dim=0)
-                current_pos += max(num_height_grid, num_width_grid)
+                current_pos += max(height, width)

                 prev_image_end = end
                 curr_position_ids.append(torch.cat([llm_position_ids, vision_position_ids], dim=-1))

-            # Add position ids for the last text tokens if any
-            end_position = len(curr_input_ids) - prev_image_end
-            llm_position_ids = text_positions[:, current_pos : current_pos + end_position].to(device=input_ids.device)
+            # Remaining text tokens (including the final image_start token for generation)
+            end_position = len(curr_input_ids_valid) - prev_image_end
+            llm_position_ids = text_positions[:, current_pos : current_pos + end_position].to(device=device)
             current_pos += llm_position_ids.shape[-1]
             curr_position_ids.append(llm_position_ids)
+
+            # Concatenate all position ids for this sample
             curr_position_ids = torch.cat(curr_position_ids, dim=-1)
-            if attention_mask is not None:
-                position_ids[:, batch_idx, attention_mask[batch_idx] == 1] = curr_position_ids.to(position_ids.device)
+
+            # Store in the main position_ids tensor
+            if valid_mask is not None:
+                position_ids[:, batch_idx, valid_mask] = curr_position_ids
             else:
-                position_ids[:, batch_idx, :] = curr_position_ids.to(position_ids.device)
+                position_ids[:, batch_idx, :] = curr_position_ids
+
+            # Build decode position ids for this sample
+            if curr_grids is not None and len(curr_grids) > 0:
+                num_decode_grids = len(curr_grids) - num_complete_images
+                num_decode_grids = max(num_decode_grids, 0)
+                decode_pos = current_pos
+
+                decode_temporal_list = []
+                decode_height_list = []
+                decode_width_list = []
+
+                for i in range(1, num_decode_grids + 1):
+                    grid_idx = -i
+                    h = curr_grids[grid_idx, 1].item()
+                    w = curr_grids[grid_idx, 2].item()
+                    total_tokens = h * w
+
+                    h_indices = torch.arange(h, device=device).unsqueeze(1).expand(h, w).flatten()
+                    w_indices = torch.arange(w, device=device).unsqueeze(0).expand(h, w).flatten()
+
+                    decode_temporal_list.append(
+                        torch.full((total_tokens,), decode_pos, device=device, dtype=torch.long)
+                    )
+                    decode_height_list.append(decode_pos + h_indices)
+                    decode_width_list.append(decode_pos + w_indices)
+                    decode_pos = decode_pos + max(h, w)
+
+                # End marker
+                decode_temporal_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+                decode_height_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+                decode_width_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+
+                sample_decode_pos_ids = torch.stack(
+                    [
+                        torch.cat(decode_temporal_list, dim=0),
+                        torch.cat(decode_height_list, dim=0),
+                        torch.cat(decode_width_list, dim=0),
+                    ],
+                    dim=0,
+                )
+                all_decode_position_ids.append(sample_decode_pos_ids)

-        # Build and store position ids for tokens that will be generated. Later we will just
-        # slice these instead of computing each decoding step
+        # Store prefill length (same for all samples since input_ids is padded to same length)
         self._prefill_len = seq_len
-        if image_grid_thw is not None and len(image_grid_thw) > 0:
-            num_decode_grids = len(image_grid_thw) - num_complete_images
-            num_decode_grids = max(num_decode_grids, 0)
-            decode_pos = current_pos
-
-            decode_temporal_list = []
-            decode_height_list = []
-            decode_width_list = []
-
-            for i in range(1, num_decode_grids + 1):
-                grid_idx = -i
-                h = image_grid_thw[grid_idx, 1].item()
-                w = image_grid_thw[grid_idx, 2].item()
-                total_tokens = h * w
-
-                h_indices = torch.arange(h, device=device).unsqueeze(1).expand(h, w).flatten()
-                w_indices = torch.arange(w, device=device).unsqueeze(0).expand(h, w).flatten()
-
-                decode_temporal_list.append(torch.full((total_tokens,), decode_pos, device=device, dtype=torch.long))
-                decode_height_list.append(decode_pos + h_indices)
-                decode_width_list.append(decode_pos + w_indices)
-                decode_pos = decode_pos + max(h, w)
-
-            decode_temporal_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
-            decode_height_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
-            decode_width_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
-
-            self._cached_decode_position_ids = torch.stack(
-                [
-                    torch.cat(decode_temporal_list, dim=0),
-                    torch.cat(decode_height_list, dim=0),
-                    torch.cat(decode_width_list, dim=0),
-                ],
-                dim=0,
-            )
+
+        # Pad decode position ids to same length and stack
+        if all_decode_position_ids:
+            max_decode_len = max(x.shape[1] for x in all_decode_position_ids)
+            padded_decode_pos_ids = [
+                F.pad(pos_ids, (0, max_decode_len - pos_ids.shape[1]), mode="replicate")
+                for pos_ids in all_decode_position_ids
+            ]
+            self._cached_decode_position_ids = torch.stack(padded_decode_pos_ids, dim=0)  # [batch, 3, max_decode_len]
         else:
             self._cached_decode_position_ids = None

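Aside: the 2D spatial encoding in the rewritten loop is easiest to see with concrete numbers. A minimal sketch for a single image grid, using the same values as the "Image-to-Image" example removed from the old docstring (height=3, width=2, starting at current_pos=1), mirroring the `repeat`/`repeat_interleave` calls above:

```python
import torch

# Assumed values for illustration: height=3, width=2, current_pos=1
height, width, current_pos = 3, 2, 1

# width index cycles [1, 2] within a row, repeated for each of the 3 rows
position_width = torch.arange(current_pos, current_pos + width).repeat(height)
# height index is constant within a row
position_height = torch.arange(current_pos, current_pos + height).repeat_interleave(width)
# temporal index is constant for the whole image
position_temporal = torch.full((height * width,), current_pos)

print(position_temporal.tolist())  # [1, 1, 1, 1, 1, 1]
print(position_height.tolist())   # [1, 1, 2, 2, 3, 3]
print(position_width.tolist())    # [1, 2, 1, 2, 1, 2]
# The next token after the image starts at current_pos + max(height, width) = 4,
# matching the end-marker position in the old docstring's Case 1.
```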
@@ -1161,21 +1146,27 @@ class GlmImageModel(GlmImagePreTrainedModel):

         return position_ids, mrope_position_deltas

-    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
-        """
-        Encodes images into continuous embeddings that can be forwarded to the language model.
-
-        Args:
-            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
-                The tensors corresponding to the input images.
-            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
-                The temporal, height and width of feature shape of each image in LLM.
+    @can_return_tuple
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
         """
         pixel_values = pixel_values.type(self.visual.dtype)
-        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+        vision_outputs = self.visual(pixel_values, grid_thw=image_grid_thw, return_dict=True, **kwargs)
         split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
-        image_embeds = torch.split(image_embeds, split_sizes)
-        return image_embeds
+        image_embeds = torch.split(vision_outputs.last_hidden_state, split_sizes)
+        vision_outputs.pooler_output = image_embeds
+
+        return vision_outputs

     def get_placeholder_mask(
         self,
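
Aside: the split sizes used above come straight from the grids: each image contributes t*h*w patches, reduced by the square of the spatial merge size. A small numeric sketch; the grid values and `spatial_merge_size=2` are assumptions for illustration only:

```python
import torch

# Two images, grids in [t, h, w] format (assumed values)
image_grid_thw = torch.tensor([[1, 4, 4], [1, 8, 4]])
spatial_merge_size = 2

# Per-image token counts after spatial merging: t*h*w // merge**2
split_sizes = (image_grid_thw.prod(-1) // spatial_merge_size**2).tolist()
print(split_sizes)  # [4, 8]

# Packed hidden states for both images, split back into per-image chunks
packed = torch.randn(sum(split_sizes), 16)
per_image = torch.split(packed, split_sizes)
print([t.shape[0] for t in per_image])  # [4, 8]
```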
@@ -1219,23 +1210,63 @@ class GlmImageModel(GlmImagePreTrainedModel):
         inputs_embeds: torch.FloatTensor | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
+        images_per_sample: torch.LongTensor | None = None,
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | GlmImageModelOutputWithPast:
         r"""
-        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+        image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
+            Images are packed across all samples in the batch.
+        images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Number of images (including target grids) for each sample in the batch.
         rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
             The rope index difference between sequence length and multimodal rope.
         """
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

+        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+
         if pixel_values is not None:
-            image_embeds = self.get_image_features(pixel_values, image_grid_thw[:-1])
-            image_embeds = torch.cat(image_embeds, dim=0)
-            image_ids = self.get_image_tokens(image_embeds, image_grid_thw[:-1])
+            # Process source images (image-to-image mode)
+            # Source images are identified by counting image_end_token_id in input_ids
+            # Note: We must exclude padding tokens since pad_token_id == image_end_token_id
+            if images_per_sample is not None:
+                grids_per_sample = torch.split(image_grid_thw, images_per_sample.tolist())
+                # Create mask for non-padding tokens (attention_mask=1 means non-padding)
+                # Handle 4D attention mask (from static cache) by extracting diagonal
+                if attention_mask is not None and attention_mask.ndim == 4:
+                    non_pad_mask = torch.diagonal(attention_mask[:, 0], dim1=1, dim2=2)
+                    if non_pad_mask.dtype.is_floating_point:
+                        non_pad_mask = non_pad_mask / torch.finfo(non_pad_mask.dtype).min
+                        non_pad_mask = (1.0 - non_pad_mask).int()
+                    # Only keep columns matching input_ids length
+                    non_pad_mask = non_pad_mask[:, -input_ids.shape[1] :]
+                else:
+                    non_pad_mask = attention_mask if attention_mask is not None else torch.ones_like(input_ids)
+
+                source_grids_list = []
+                for sample_idx in range(batch_size):
+                    is_image_end = input_ids[sample_idx] == self.config.image_end_token_id
+                    is_non_pad = non_pad_mask[sample_idx] == 1
+                    num_source = (is_image_end & is_non_pad).sum().item()
+                    if num_source > 0:
+                        source_grids_list.append(grids_per_sample[sample_idx][:num_source])
+                if len(source_grids_list) == 0:
+                    raise ValueError(
+                        "pixel_values provided but no source images found in input_ids. "
+                        "Ensure input_ids contains image_end_token_id for each source image."
+                    )
+                source_grids = torch.cat(source_grids_list, dim=0)
+            else:
+                # Fallback for batch_size=1: all but last grid are source images
+                source_grids = image_grid_thw[:-1]
+
+            image_features = self.get_image_features(pixel_values, source_grids, return_dict=True)
+            image_embeds = torch.cat(image_features.pooler_output, dim=0)
+            image_ids = self.get_image_tokens(image_embeds, source_grids)
             image_ids = image_ids.view(-1).to(input_ids.device)
             special_image_mask = self.get_placeholder_mask(input_ids, image_ids)
             input_ids = input_ids.masked_scatter(special_image_mask, image_ids)
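
Aside: the diagonal trick above recovers a 2D padding mask from a 4D additive mask. On the diagonal, an attendable token holds 0 and a masked one holds the dtype minimum, so dividing by that minimum and flipping yields 0/1 flags. A self-contained sketch of the same arithmetic with made-up sizes:

```python
import torch

# Tiny 4D additive mask [batch=1, heads=1, q=3, kv=3]:
# position 0 is padding, positions 1-2 are real tokens (causal).
neg = torch.finfo(torch.float32).min
mask_4d = torch.full((1, 1, 3, 3), neg)
mask_4d[0, 0, 1, 1] = 0.0
mask_4d[0, 0, 2, 1:] = 0.0

# Diagonal of the (q, kv) plane: 0 where a token may attend to itself,
# dtype-min where it is padding.
diag = torch.diagonal(mask_4d[:, 0], dim1=1, dim2=2)
non_pad = (1.0 - diag / neg).int()
print(non_pad.tolist())  # [[0, 1, 1]] -> position 0 is padding
```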
@@ -1253,8 +1284,6 @@ class GlmImageModel(GlmImagePreTrainedModel):
             attention_mask_2d = (1.0 - attention_mask_2d).int()

         # Calculate RoPE index once per generation in the pre-fill stage only.
-        # It is safe to assume that `length!=1` means we're in pre-fill because the
-        # model is used only by DiT pipeline without assisted decoding, etc. techniques
         is_prefill_stage = (input_ids is not None and input_ids.shape[1] != 1) or (
             inputs_embeds is not None and inputs_embeds.shape[1] != 1
         )
@@ -1262,17 +1291,27 @@ class GlmImageModel(GlmImagePreTrainedModel):
             position_ids, rope_deltas = self.get_rope_index(
                 input_ids,
                 image_grid_thw,
+                images_per_sample=images_per_sample,
                 attention_mask=attention_mask_2d,
             )
             self.rope_deltas = rope_deltas
         # then use the prev pre-calculated rope-deltas to get the correct position ids
         else:
             batch_size, seq_length, _ = inputs_embeds.shape
-            # Use prefill token length, not position value
-            step = cache_position[0].item() - self._prefill_len
-            # Direct lookup - no tensor creation overhead
-            position_ids = self._cached_decode_position_ids[:, step : step + seq_length]
-            position_ids = position_ids.unsqueeze(1).expand(-1, batch_size, -1)
+            # Per-sample decode position lookup
+            # _cached_decode_position_ids shape: [batch_size, 3, max_decode_len]
+            if self._cached_decode_position_ids is not None:
+                step = cache_position[0].item() - self._prefill_len
+                # Get position ids for all samples at once, then transpose to [3, batch_size, seq_length]
+                position_ids = self._cached_decode_position_ids[:, :, step : step + seq_length].permute(1, 0, 2)
+            else:
+                # Fallback for text-to-image or cases without cached decode positions
+                # Use simple incremental positions
+                start_pos = cache_position[0].item()
+                position_ids = torch.arange(
+                    start_pos, start_pos + seq_length, device=inputs_embeds.device, dtype=torch.long
+                )
+                position_ids = position_ids.unsqueeze(0).repeat(3, batch_size, 1)

         outputs = self.language_model(
             input_ids=None,
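
Aside: the decode-time lookup amounts to simple indexing into the cache built during prefill: the offset is the number of tokens generated so far, and the slice is transposed from the per-sample layout `[batch, 3, len]` to the `[3, batch, len]` layout the rest of the model expects. A toy sketch with made-up sizes:

```python
import torch

# Assumed toy sizes: batch of 2 samples, 5 cached decode positions each
cached = torch.arange(2 * 3 * 5).reshape(2, 3, 5)  # [batch, 3, max_decode_len]
prefill_len, cache_position = 7, torch.tensor([9])
seq_length = 1  # one new token per decode step

step = cache_position[0].item() - prefill_len  # 2 tokens generated so far
position_ids = cached[:, :, step : step + seq_length].permute(1, 0, 2)
print(position_ids.shape)  # torch.Size([3, 2, 1])
```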
@@ -1319,8 +1358,8 @@ class GlmImageModel(GlmImagePreTrainedModel):
             grid_t, grid_h, grid_w = image_grid_thw[i].tolist()
             hs = hs.view(grid_t, grid_h, grid_w, hidden_size)
             hs = hs.permute(0, 3, 1, 2).contiguous()
-            _, _, image_toks = self.vqmodel.encode(hs)
-            all_image_toks.append(image_toks)
+            vqmodel_outputs: GlmImageVQVAEModelOutput = self.vqmodel.encode(hs)
+            all_image_toks.append(vqmodel_outputs.image_tokens)
         return torch.cat(all_image_toks, dim=0)

@@ -1369,8 +1408,20 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         # Initialize weights and apply final processing
         self.post_init()

-    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
-        return self.model.get_image_features(pixel_values, image_grid_thw)
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        """
+        return self.model.get_image_features(pixel_values, image_grid_thw, **kwargs)

     def get_image_tokens(self, hidden_states: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
         return self.model.get_image_tokens(hidden_states, image_grid_thw)
@@ -1385,6 +1436,7 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         labels: torch.LongTensor | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
+        images_per_sample: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
1394
1446
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1395
1447
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1396
1448
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1397
- image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1449
+ image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
1398
1450
  The temporal, height and width of feature shape of each image in LLM.
1451
+ Images are packed across all samples in the batch.
1452
+ images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1453
+ Number of images (including target grids) for each sample in the batch.
1399
1454
 
1400
1455
  Example:
1401
1456
 
1402
1457
  ```python
1403
1458
  >>> from PIL import Image
1404
- >>> import requests
1459
+ >>> import httpx
1460
+ >>> from io import BytesIO
1405
1461
  >>> from transformers import AutoProcessor, GlmImageForConditionalGeneration
1406
1462
 
1407
1463
  >>> model = GlmImageForConditionalGeneration.from_pretrained("zai-org/GLM-Image")
@@ -1417,7 +1473,8 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         },
         ]
         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))

         >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
@@ -1431,6 +1488,7 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
             input_ids=input_ids,
             pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
+            images_per_sample=images_per_sample,
             position_ids=position_ids,
             attention_mask=attention_mask,
             past_key_values=past_key_values,
1469
1527
  use_cache=True,
1470
1528
  pixel_values=None,
1471
1529
  image_grid_thw=None,
1530
+ images_per_sample=None,
1472
1531
  is_first_iteration=False,
1473
1532
  **kwargs,
1474
1533
  ):
@@ -1487,6 +1546,7 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         )

         model_inputs["position_ids"] = None
+        model_inputs["images_per_sample"] = images_per_sample

         if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
1523
1583
  if expand_size == 1:
1524
1584
  return input_ids, model_kwargs
1525
1585
 
1526
- visual_keys = ["pixel_values", "image_grid_thw"]
1586
+ visual_keys = ["pixel_values", "image_grid_thw", "images_per_sample"]
1527
1587
 
1528
1588
  def _expand_dict_for_generation_visual(dict_to_expand):
1529
1589
  image_grid_thw = model_kwargs.get("image_grid_thw", None)
1530
- image_nums = self._get_image_nums(input_ids)
1590
+ if image_grid_thw is None:
1591
+ return dict_to_expand
1592
+
1593
+ images_per_sample = model_kwargs.get("images_per_sample", None)
1594
+
1595
+ # Use images_per_sample if available
1596
+ if images_per_sample is not None:
1597
+ image_nums = images_per_sample.tolist()
1598
+ elif input_ids is not None:
1599
+ # Try to infer from image_grid_thw / batch_size
1600
+ batch_size = input_ids.shape[0]
1601
+ total_grids = image_grid_thw.shape[0]
1602
+ if total_grids % batch_size == 0:
1603
+ grids_per_sample = total_grids // batch_size
1604
+ image_nums = [grids_per_sample] * batch_size
1605
+ else:
1606
+ # Cannot evenly distribute grids - fall back to simple repeat_interleave
1607
+ # This handles test cases where image_grid_thw has (batch_size + 1) rows
1608
+ dict_to_expand["image_grid_thw"] = image_grid_thw.repeat_interleave(expand_size, dim=0)
1609
+ if dict_to_expand.get("pixel_values") is not None:
1610
+ dict_to_expand["pixel_values"] = dict_to_expand["pixel_values"].repeat_interleave(
1611
+ expand_size, dim=0
1612
+ )
1613
+ return dict_to_expand
1614
+ else:
1615
+ image_nums = self._get_image_nums(input_ids).tolist()
1616
+
1617
+ # Get source image counts per sample from image_end_token_id count
1618
+ source_image_nums = [
1619
+ (input_ids[batch_idx] == self.config.image_end_token_id).sum().item()
1620
+ for batch_idx in range(len(image_nums))
1621
+ ]
1531
1622
 
1532
1623
  def _repeat_interleave_samples(x, lengths, repeat_times):
1533
1624
  samples = torch.split(x, lengths)
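
Aside: counting occurrences of a sentinel token per sample, as `source_image_nums` does above, is a one-line tensor comparison. A toy sketch; the token id 9 is an assumed value for illustration:

```python
import torch

image_end_token_id = 9  # assumed sentinel id for illustration
input_ids = torch.tensor([
    [5, 9, 7, 9, 2],  # sample 0: two completed source images
    [5, 7, 2, 2, 2],  # sample 1: none
])

source_image_nums = [
    (input_ids[i] == image_end_token_id).sum().item() for i in range(input_ids.shape[0])
]
print(source_image_nums)  # [2, 0]
```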
@@ -1537,21 +1628,31 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)

            for key in dict_to_expand:
                if key == "pixel_values":
-                    # split images into samples
-                    samples = torch.split(image_grid_thw[: sum(image_nums)], list(image_nums))
-                    # compute the sequence length of images for each sample
-                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
-                    dict_to_expand[key] = _repeat_interleave_samples(
-                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
-                    )
+                    # Split images into samples based on source image counts
+                    if sum(source_image_nums) > 0:
+                        # Split grids by sample to compute pixel counts
+                        grids_per_sample = torch.split(image_grid_thw, image_nums)
+                        lengths = []
+                        for batch_idx, sample_grids in enumerate(grids_per_sample):
+                            num_source = source_image_nums[batch_idx]
+                            if num_source > 0:
+                                source_grids = sample_grids[:num_source]
+                                lengths.append(torch.prod(source_grids, dim=1).sum().item())
+                            else:
+                                lengths.append(0)
+
+                        dict_to_expand[key] = _repeat_interleave_samples(
+                            dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                        )
                elif key == "image_grid_thw":
-                    # get the num of images for each sample and +1 for the image being generated
-                    lengths = list(image_nums)
-                    last_image = dict_to_expand[key][-1:]
+                    # Expand all grids (source + target) per sample
                    dict_to_expand[key] = _repeat_interleave_samples(
-                        dict_to_expand[key][: sum(image_nums)], lengths=lengths, repeat_times=expand_size
+                        dict_to_expand[key], lengths=image_nums, repeat_times=expand_size
                    )
-                    dict_to_expand[key] = torch.cat([dict_to_expand[key], last_image], dim=0)
+                elif key == "images_per_sample":
+                    # Simply repeat the counts
+                    if dict_to_expand.get(key) is not None:
+                        dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
            return dict_to_expand

        def _expand_dict_for_generation(dict_to_expand):
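
Aside: the expansion helper splits a packed tensor into per-sample chunks, repeats each chunk, and re-packs, so samples contributing different numbers of rows expand correctly. The diff only shows the helper's first line, so the following is a hedged sketch of plausible semantics, not the library's actual implementation:

```python
import torch

def repeat_interleave_samples(x, lengths, repeat_times):
    # Split packed rows into per-sample chunks, repeat each chunk as a
    # block, then re-pack. Assumed behavior of _repeat_interleave_samples.
    samples = torch.split(x, lengths)
    repeated = [s.repeat(repeat_times, *([1] * (s.dim() - 1))) for s in samples]
    return torch.cat(repeated, dim=0)

# Two samples: 2 grids for sample 0, 1 grid for sample 1; expand 2x
grids = torch.tensor([[1, 3, 2], [1, 4, 4], [1, 8, 8]])
out = repeat_interleave_samples(grids, lengths=[2, 1], repeat_times=2)
print(out.tolist())
# [[1, 3, 2], [1, 4, 4], [1, 3, 2], [1, 4, 4], [1, 8, 8], [1, 8, 8]]
```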