transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/git/modeling_git.py

@@ -26,8 +26,9 @@ from torch import nn
 from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PreTrainedConfig
 from ...generation import GenerationMixin
-from ...
+from ...masking_utils import create_masks_for_generate
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -69,6 +70,104 @@ class GitVisionModelOutput(ModelOutput):
     attentions: Optional[tuple[torch.FloatTensor, ...]] = None


+# Copied from transformers.models.gemma3.modeling_gemma3.token_type_ids_mask_function
+def token_type_ids_mask_function(
+    token_type_ids: Optional[torch.Tensor],
+    image_group_ids: Optional[torch.Tensor],
+) -> Optional[Callable]:
+    """
+    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
+    not start and end indices.
+    """
+    # Do not return an additional mask in this case
+    if token_type_ids is None:
+        return None
+
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        # If it's 1 for both query and key/value, we are in an image block
+        # NOTE: static cache shape goes beyond input seq length, while token_type_ids.shape[1] == input seq length
+        # Since vmap doesn't support `if statement` we workaround it with `torch.where`
+        safe_q_idx = torch.where(q_idx < token_type_ids.shape[1], q_idx, 0)
+        safe_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
+
+        token_type_ids_at_q_idx = token_type_ids[batch_idx, safe_q_idx]
+        token_type_ids_at_q_idx = torch.where(q_idx < token_type_ids.shape[1], token_type_ids_at_q_idx, 0)
+
+        token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_kv_idx]
+        token_type_ids_at_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], token_type_ids_at_kv_idx, 0)
+
+        image_group_ids_at_q_idx = image_group_ids[batch_idx, safe_q_idx]
+        image_group_ids_at_q_idx = torch.where(q_idx < image_group_ids.shape[1], image_group_ids_at_q_idx, -1)
+
+        image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_kv_idx]
+        image_group_ids_at_kv_idx = torch.where(kv_idx < image_group_ids.shape[1], image_group_ids_at_kv_idx, -1)
+
+        is_image_block = (token_type_ids_at_q_idx == 1) & (token_type_ids_at_kv_idx == 1)
+        same_image_block = image_group_ids_at_q_idx == image_group_ids_at_kv_idx
+
+        # This is bidirectional attention whenever we are dealing with image tokens
+        return is_image_block & same_image_block
+
+    return inner_mask
+
+
+# Copied from transformers.models.gemma3.modeling_gemma3.create_causal_mask_mapping
+def create_causal_mask_mapping(
+    config: PreTrainedConfig,
+    input_embeds: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    cache_position: torch.Tensor,
+    past_key_values: Optional[Cache],
+    position_ids: Optional[torch.Tensor],
+    token_type_ids: Optional[torch.Tensor] = None,
+    pixel_values: Optional[torch.FloatTensor] = None,
+    is_training: bool = False,
+    is_first_iteration: Optional[bool] = None,
+    **kwargs,
+) -> dict:
+    """
+    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
+    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.
+
+    Uses `pixel_values` as an optional input to disambiguate edge cases.
+    """
+    if is_training and token_type_ids is None:
+        raise ValueError("`token_type_ids` is required as a model input when training")
+
+    mask_kwargs = {
+        "config": config.get_text_config(),
+        "input_embeds": input_embeds,
+        "attention_mask": attention_mask,
+        "cache_position": cache_position,
+        "past_key_values": past_key_values,
+        "position_ids": position_ids,
+    }
+    # NOTE: this `may_have_image_input` logic is not flawless, it fails when we're using a cache eagerly initialized
+    # (e.g. compiled prefill) AND `pixel_values` are not provided (i.e. the image data is provided through other
+    # means). Determining prefill in that case requires checking data values, which is not compile-compatible.
+    is_first_iteration = (
+        is_first_iteration
+        if is_first_iteration is not None
+        else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None)
+    )
+    if token_type_ids is not None and is_first_iteration:
+        # We need to pass an additional mask function to account for token type ids, and it needs to be an `or` (to
+        # undo the causal masking)
+
+        # First find where a new image block starts: 1 if image and previous not image
+        # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally
+        is_image = (token_type_ids == 1).to(cache_position.device)
+        is_previous_image = nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
+        new_image_start = is_image & ~is_previous_image
+        image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
+        image_group_ids = torch.where(is_image, image_group_ids, -1)
+        mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
+            token_type_ids.to(cache_position.device), image_group_ids
+        )
+
+    return create_masks_for_generate(**mask_kwargs)
+
+
 class GitEmbeddings(nn.Module):
     """Construct the embeddings from word and position embeddings."""

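The two helpers above are copied from Gemma3: image tokens (`token_type_ids == 1`) attend to each other bidirectionally as long as they belong to the same image block, while text stays causal. A minimal, self-contained sketch of the grouping step with toy values (not part of the package):

```python
import torch

# token_type_ids marks image patches with 1 and text tokens with 0.
token_type_ids = torch.tensor([[1, 1, 1, 0, 0, 1, 1, 0]])

is_image = token_type_ids == 1
is_previous_image = torch.nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
new_image_start = is_image & ~is_previous_image  # 1 at the first token of each image block
image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
image_group_ids = torch.where(is_image, image_group_ids, -1)
print(image_group_ids)  # tensor([[ 0,  0,  0, -1, -1,  1,  1, -1]])

# token_type_ids_mask_function then ORs "same image group" query/key pairs into the
# causal mask, which is what makes attention bidirectional inside each image block.
```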
@@ -148,17 +247,15 @@ class GitSelfAttention(nn.Module):
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.FloatTensor] = None,
         past_key_values: Optional[Cache] = None,
-
-        pixel_values_present: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
     ) -> tuple[torch.Tensor]:
-        batch_size
+        batch_size = hidden_states.shape[0]
         query_layer = (
             self.query(hidden_states)
             .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
             .transpose(1, 2)
         )

-        cutoff = self.image_patch_tokens if pixel_values_present else 0
         key_layer = (
             self.key(hidden_states)
             .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
@@ -170,12 +267,9 @@ class GitSelfAttention(nn.Module):
             .transpose(1, 2)
         )
         if past_key_values is not None:
-
-
-                key_layer[:, :, cutoff:, :], value_layer[:, :, cutoff:, :], self.layer_idx
+            key_layer, value_layer = past_key_values.update(
+                key_layer, value_layer, self.layer_idx, cache_kwargs={"cache_position": cache_position}
             )
-            key_layer = torch.cat([key_layer[:, :, :cutoff, :], key_layer_past], dim=2)
-            value_layer = torch.cat([value_layer[:, :, :cutoff, :], value_layer_past], dim=2)

         # Take the dot product between "query" and "key" to get the raw attention scores.
         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
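GitSelfAttention drops the manual `cutoff` bookkeeping and past key/value concatenation in favour of the standard `Cache.update` API keyed by `cache_position`. A rough sketch of that contract, assuming the no-argument `DynamicCache` constructor:

```python
import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
layer_idx = 0

for step in range(3):
    # one new token per step: [batch, num_heads, new_tokens, head_dim]
    new_key = torch.randn(1, 4, 1, 8)
    new_value = torch.randn(1, 4, 1, 8)
    cache_position = torch.tensor([step])
    key, value = cache.update(new_key, new_value, layer_idx, cache_kwargs={"cache_position": cache_position})
    print(step, key.shape[-2])  # cached length grows: 1, 2, 3
```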
@@ -232,15 +326,14 @@ class GitAttention(nn.Module):
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.FloatTensor] = None,
         past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = False,
-        pixel_values_present: Optional[bool] = False,
     ) -> tuple[torch.Tensor]:
         attn_output, self_attn_weights = self.self(
             hidden_states,
             attention_mask,
             past_key_values,
-
-            pixel_values_present,
+            cache_position=cache_position,
         )
         attention_output = self.output(attn_output, hidden_states)
         return attention_output, self_attn_weights
@@ -291,8 +384,8 @@ class GitLayer(GradientCheckpointingLayer):
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.FloatTensor] = None,
         past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = False,
-        pixel_values_present: Optional[bool] = False,
     ) -> tuple[torch.Tensor]:
         # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         attention_output, self_attention_weights = self.attention(
@@ -300,7 +393,7 @@ class GitLayer(GradientCheckpointingLayer):
             attention_mask,
             output_attentions=output_attentions,
             past_key_values=past_key_values,
-
+            cache_position=cache_position,
         )

         layer_output = apply_chunking_to_forward(
@@ -329,8 +422,8 @@ class GitEncoder(nn.Module):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = False,
         output_hidden_states: Optional[bool] = False,
-        pixel_values_present: Optional[bool] = False,
         return_dict: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
         if self.gradient_checkpointing and self.training:
             if use_cache:
@@ -353,7 +446,7 @@ class GitEncoder(nn.Module):
                 attention_mask,
                 past_key_values,
                 output_attentions,
-
+                cache_position,
             )

             hidden_states = layer_outputs[0]
@@ -396,6 +489,7 @@ class GitPreTrainedModel(PreTrainedModel):
             init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.patch_embedding.weight, std=self.config.initializer_range)
             init.normal_(module.position_embedding.weight, std=self.config.initializer_range)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         if isinstance(module, nn.Linear):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
@@ -408,6 +502,8 @@ class GitPreTrainedModel(PreTrainedModel):
         elif isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+        elif isinstance(module, GitEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 # Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Git
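`_init_weights` now also fills the persistent `position_ids` buffers (in the vision embeddings and in `GitEmbeddings`) with `0..n-1` via `init.copy_`. In plain torch, the effect of that call is simply the following; `ToyEmbeddings` is a stand-in module for illustration:

```python
import torch
from torch import nn

class ToyEmbeddings(nn.Module):
    def __init__(self, max_position_embeddings: int = 16, hidden_size: int = 8):
        super().__init__()
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.register_buffer("position_ids", torch.zeros(1, max_position_embeddings, dtype=torch.long))

module = ToyEmbeddings()
with torch.no_grad():
    # equivalent of init.copy_(module.position_ids, torch.arange(...).expand((1, -1)))
    module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
print(module.position_ids[0, :5])  # tensor([0, 1, 2, 3, 4])
```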
@@ -903,62 +999,6 @@ class GitModel(GitPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embeddings.word_embeddings = value

-    def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-        # Default mask is for forward direction. Flip for backward direction.
-        mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1)
-        mask = mask.masked_fill(mask == 1, float("-inf"))
-        return mask
-
-    def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None):
-        num_tgt = tgt.shape[1]
-        num_memory = memory.shape[1]
-        device = tgt.device
-        dtype = tgt.dtype
-        top_left = torch.zeros((num_memory, num_memory), device=device, dtype=dtype)
-        top_right = torch.full(
-            (num_memory, num_tgt + past_key_values_length),
-            float("-inf"),
-            device=tgt.device,
-            dtype=dtype,
-        )
-        bottom_left = torch.zeros(
-            (num_tgt, num_memory),
-            dtype=dtype,
-            device=tgt_mask.device,
-        )
-
-        if past_key_values_length > 0:
-            tgt_mask = torch.zeros(
-                (tgt_mask.shape[0], tgt_mask.shape[0] + past_key_values_length),
-                dtype=dtype,
-                device=tgt_mask.device,
-            )
-
-        left = torch.cat((top_left, bottom_left), dim=0)
-        right = torch.cat((top_right, tgt_mask.to(dtype)), dim=0)
-
-        full_attention_mask = torch.cat((left, right), dim=1)[None, :]
-
-        if memory_key_padding_mask is None:
-            memory_key_padding_mask = torch.full((memory.shape[0], memory.shape[1]), fill_value=False, device=device)
-        # if it is False, it means valid. That is, it is not a padding
-        if memory_key_padding_mask.dtype != torch.bool:
-            raise ValueError("Memory key padding mask must be a boolean tensor.")
-        zero_negative_infinity = torch.zeros_like(memory_key_padding_mask, dtype=tgt.dtype)
-        zero_negative_infinity[memory_key_padding_mask] = float("-inf")
-        full_attention_mask = full_attention_mask.expand(
-            (memory_key_padding_mask.shape[0], num_memory + num_tgt, num_memory + past_key_values_length + num_tgt)
-        )
-        full_attention_mask = full_attention_mask.clone()
-        origin_left = full_attention_mask[:, :, :num_memory]
-        update = zero_negative_infinity[:, None, :]
-        full_attention_mask[:, :, :num_memory] = origin_left + update
-
-        # add axis for multi-head
-        full_attention_mask = full_attention_mask[:, None, :, :]
-
-        return full_attention_mask
-
     @auto_docstring
     def forward(
         self,
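The removed `_generate_future_mask`/`create_attention_mask` pair hand-built GIT's additive attention mask: image ("memory") tokens attend only to each other, text attends to all image tokens plus causally to earlier text. For reference, this is the block layout the removed code produced in the simplest case (no padding, no cache); the new `create_causal_mask_mapping` path expresses the same pattern through `token_type_ids` instead:

```python
import torch

num_memory, num_tgt = 2, 3  # 2 image tokens, 3 text tokens
# causal part over the text tokens, -inf strictly above the diagonal
tgt_mask = torch.triu(torch.full((num_tgt, num_tgt), float("-inf")), diagonal=1)

top = torch.cat([torch.zeros(num_memory, num_memory), torch.full((num_memory, num_tgt), float("-inf"))], dim=1)
bottom = torch.cat([torch.zeros(num_tgt, num_memory), tgt_mask], dim=1)
print(torch.cat([top, bottom], dim=0))
# rows = queries, cols = keys; 0.0 = may attend, -inf = blocked
```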
@@ -973,6 +1013,7 @@ class GitModel(GitPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
         r"""
@@ -1005,15 +1046,6 @@ class GitModel(GitPreTrainedModel):

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
-            input_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        seq_length = input_shape[1]

         # past_key_values_length
         past_key_values_length = 0
@@ -1024,7 +1056,23 @@ class GitModel(GitPreTrainedModel):
|
|
|
1024
1056
|
else past_key_values.get_seq_length()
|
|
1025
1057
|
)
|
|
1026
1058
|
|
|
1027
|
-
|
|
1059
|
+
embedding_output = self.embeddings(
|
|
1060
|
+
input_ids=input_ids,
|
|
1061
|
+
position_ids=position_ids,
|
|
1062
|
+
inputs_embeds=inputs_embeds,
|
|
1063
|
+
past_key_values_length=past_key_values_length,
|
|
1064
|
+
)
|
|
1065
|
+
|
|
1066
|
+
if cache_position is None:
|
|
1067
|
+
cache_position = torch.arange(
|
|
1068
|
+
past_key_values_length,
|
|
1069
|
+
past_key_values_length + embedding_output.shape[1],
|
|
1070
|
+
device=embedding_output.device,
|
|
1071
|
+
)
|
|
1072
|
+
|
|
1073
|
+
# Always create `token_type_ids` so we can re-use Gemma3 style mask preparation fn
|
|
1074
|
+
token_type_ids = torch.zeros_like(embedding_output, dtype=torch.int)[..., 0]
|
|
1075
|
+
|
|
1028
1076
|
if pixel_values is not None:
|
|
1029
1077
|
if pixel_values.ndim == 4:
|
|
1030
1078
|
# here we assume pixel_values is of shape (batch_size, num_channels, height, width)
|
|
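The block added above derives `cache_position` lazily when the caller does not pass one: positions for the new tokens start right after whatever is already stored in the cache. A standalone sketch of that bookkeeping (the function name is illustrative, not part of the GIT model):

```python
import torch

def default_cache_position(past_length: int, embeds: torch.Tensor) -> torch.Tensor:
    # Positions of the new tokens, starting right after the cached prefix.
    return torch.arange(past_length, past_length + embeds.shape[1], device=embeds.device)

# Example: 5 tokens already cached, 3 new tokens -> positions [5, 6, 7]
print(default_cache_position(5, torch.zeros(1, 3, 8)))
```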
@@ -1050,60 +1098,54 @@ class GitModel(GitPreTrainedModel):
 
             projected_visual_features = self.visual_projection(visual_features)
 
-        embedding_output = self.embeddings(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            inputs_embeds=inputs_embeds,
-            past_key_values_length=past_key_values_length,
-        )
-
-        if projected_visual_features is None:
-            projected_visual_features = torch.zeros(
-                (embedding_output.shape[0], 0, embedding_output.shape[2]),
-                dtype=embedding_output.dtype,
-                device=embedding_output.device,
+        # Repeat visual features to match embedding batch size.
+        projected_visual_features = projected_visual_features.repeat(
+            embedding_output.size(0) // projected_visual_features.size(0), 1, 1
         )
 
-        # Repeat visual features to match embedding batch size.
-        projected_visual_features = projected_visual_features.repeat(
-            embedding_output.size(0) // projected_visual_features.size(0), 1, 1
-        )
-
-        # concatenate patch token and text token embeddings
-        hidden_states = torch.cat((projected_visual_features, embedding_output), dim=1)
-
-        # By default, an additive causal mask is created
-        # for masking the future (one direction).
-        tgt_mask = self._generate_future_mask(seq_length, embedding_output.dtype, embedding_output.device)
+        # concatenate patch token and text token embeddings
+        embedding_output = torch.cat((projected_visual_features, embedding_output), dim=1)
+        image_token_type_ids = torch.ones_like(projected_visual_features, dtype=torch.int)[..., 0]
+        token_type_ids = torch.cat([image_token_type_ids, token_type_ids], dim=-1)
+        cache_position = torch.arange(embedding_output.shape[1], device=embedding_output.device, dtype=torch.int)
+        if attention_mask is not None:
+            attention_mask = torch.cat([torch.ones_like(image_token_type_ids), attention_mask], dim=-1)
+        elif past_key_values is not None and input_ids.shape[1] == 1:
+            # Expand attention mask and cache position with image tokens because GIT doesn't add image
+            # placeholder tokens when processing. Doesn't worth the refactor, low usage!
+            cache_position = torch.tensor(
+                [past_key_values_length], dtype=cache_position.dtype, device=cache_position.device
+            )
+            extended_attention_mask = torch.ones(
+                (attention_mask.shape[0], past_key_values_length - attention_mask.shape[1] + 1),
+                dtype=attention_mask.dtype,
+                device=attention_mask.device,
+            )
+            attention_mask = torch.cat([extended_attention_mask, attention_mask], dim=-1)
 
-        # Create an attention mask of shape [batch_size, 1, tgt_seq_len, src_seq_len]
-        combined_attention_mask = self.create_attention_mask(
-            tgt=embedding_output,
-            memory=projected_visual_features,
-            tgt_mask=tgt_mask,
-            past_key_values_length=past_key_values_length,
+        # Images attend each other bidirectionally while text remains causal
+        causal_mask = create_causal_mask_mapping(
+            self.config,
+            embedding_output,
+            attention_mask,
+            cache_position,
+            past_key_values,
+            None,
+            token_type_ids,
+            pixel_values,
         )
 
-        if attention_mask is not None:
-            # if the user provides an attention mask, we add it to the default one
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _prepare_4d_attention_mask(
-                attention_mask, embedding_output.dtype, tgt_len=input_shape[-1]
-            ).to(embedding_output.device)
-            if past_key_values_length > 0:
-                expanded_attn_mask = expanded_attn_mask[:, :, -past_key_values_length:, :]
-            else:
-                combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask
+        hidden_states = embedding_output
 
         encoder_outputs = self.encoder(
             hidden_states,
-            attention_mask=combined_attention_mask,
+            attention_mask=causal_mask,
             past_key_values=past_key_values,
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-
+            cache_position=cache_position,
         )
         sequence_output = encoder_outputs[0]
 
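The rewritten forward above relies on `token_type_ids` (1 for image positions, 0 for text) so that the mask utility can let image tokens attend to each other bidirectionally while text keeps its causal pattern. A small standalone sketch of that masking rule, built directly with tensor ops rather than the library's mask utilities (which also fold in padding and cache offsets):

```python
import torch

def image_text_mask(token_type_ids: torch.Tensor) -> torch.Tensor:
    # token_type_ids: [batch, seq], 1 = image token, 0 = text token.
    seq = token_type_ids.shape[1]
    causal = torch.tril(torch.ones(seq, seq, dtype=torch.bool))      # text attends to the past only
    both_image = (token_type_ids[:, :, None] == 1) & (token_type_ids[:, None, :] == 1)
    return causal[None] | both_image                                 # [batch, seq, seq], True = may attend

# 2 image tokens followed by 3 text tokens
tt = torch.tensor([[1, 1, 0, 0, 0]])
print(image_text_mask(tt).int())
```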
@@ -1157,6 +1199,7 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        cache_position: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
         r"""
@@ -1306,6 +1349,7 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
             output_hidden_states=output_hidden_states,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=return_dict,
+            cache_position=cache_position,
         )
 
         hidden_states = outputs[0]
@@ -1339,7 +1383,15 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
         )
 
     def prepare_inputs_for_generation(
-        self,
+        self,
+        input_ids,
+        past_key_values=None,
+        pixel_values=None,
+        attention_mask=None,
+        use_cache=None,
+        cache_position=None,
+        is_first_iteration=False,
+        **kwargs,
     ):
         # Overwritten -- `git` has special cache handling and doesn't support generating from `inputs_embeds` atm
 
@@ -1364,11 +1416,14 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
         model_inputs = {
             "input_ids": input_ids,
             "attention_mask": attention_mask,
-            "pixel_values": kwargs.get("pixel_values"),
             "past_key_values": past_key_values,
             "use_cache": use_cache,
+            "cache_position": cache_position,
         }
 
+        if is_first_iteration or not use_cache:
+            model_inputs["pixel_values"] = pixel_values
+
         # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
         for key, value in kwargs.items():
             if key not in model_inputs:
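The reworked `prepare_inputs_for_generation` above only forwards `pixel_values` on the first step (or when no cache is used), so the image is encoded once and later decoding steps rely on the KV cache. A minimal, hypothetical sketch of that gating pattern (not the actual GIT helper):

```python
def prepare_step_inputs(input_ids, pixel_values=None, use_cache=True, is_first_iteration=False):
    # Cached decoding steps reuse the image features already folded into the KV cache,
    # so the expensive vision inputs are only passed on the first forward pass.
    inputs = {"input_ids": input_ids, "use_cache": use_cache}
    if is_first_iteration or not use_cache:
        inputs["pixel_values"] = pixel_values
    return inputs
```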
@@ -79,7 +79,7 @@ class GlmRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
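The rotary-embedding change above keeps the original inverse frequencies as a non-persistent buffer instead of a plain attribute, presumably so they follow the module across `.to()` / device moves while staying out of the checkpoint. A small illustrative sketch of the same pattern (hypothetical module, not the Glm class):

```python
import torch
from torch import nn

class RotaryFreqs(nn.Module):
    def __init__(self, dim: int = 8, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        # Buffers move with the module (device/dtype casts) but, with
        # persistent=False, are not written into the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

m = RotaryFreqs()
print(list(m.state_dict().keys()))  # [] -> neither buffer is serialized
```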
@@ -284,7 +284,7 @@ class Glm4RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -354,7 +354,6 @@ class Glm46VImageProcessor(BaseImageProcessor):
             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                 Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                 `True`.
-                The max pixels of the image to resize the image.
             patch_size (`int`, *optional*, defaults to `self.patch_size`):
                 The spatial patch size of the vision encoder.
             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
@@ -381,12 +380,9 @@ class Glm46VImageProcessor(BaseImageProcessor):
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
 
         """
-        # Try to use config values if set, otherwise fallback to global defaults
         size = size if size is not None else self.size
         if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-        elif size is None:
-            size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
 
         do_resize = do_resize if do_resize is not None else self.do_resize
         resample = resample if resample is not None else self.resample
@@ -639,6 +639,7 @@ class Glm46VForConditionalGeneration(Glm46VPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -655,13 +656,14 @@ class Glm46VForConditionalGeneration(Glm46VPreTrainedModel, GenerationMixin):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
@@ -110,6 +110,9 @@ class Glm46VPreTrainedModel(Glm4vPreTrainedModel):
     _can_record_outputs = None
     _no_split_modules = None
 
+    def _init_weights(self, module):
+        raise AttributeError("Not needed")
+
 
 class Glm46VModel(Glm4vModel):
     _no_split_modules = None
@@ -30,7 +30,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -38,7 +38,7 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
 from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_glm4_moe import Glm4MoeConfig
 
@@ -60,7 +60,7 @@ class Glm4MoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -332,6 +332,7 @@ class Glm4MoeRMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
 
 
+@use_experts_implementation
 class Glm4MoeNaiveMoe(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -339,7 +340,7 @@ class Glm4MoeNaiveMoe(nn.Module):
         super().__init__()
         self.num_experts = config.num_local_experts
        self.hidden_dim = config.hidden_size
-        self.intermediate_dim = config.intermediate_size
+        self.intermediate_dim = config.moe_intermediate_size
         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
         self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
         self.act_fn = ACT2FN[config.hidden_act]
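`Glm4MoeNaiveMoe` above stores all experts as stacked 3D tensors (`gate_up_proj` of shape `[num_experts, 2 * intermediate, hidden]`, `down_proj` of shape `[num_experts, hidden, intermediate]`), presumably so the `@use_experts_implementation` decorator can swap between a naive loop and fused/grouped kernels. A rough, self-contained sketch of a naive routed forward over such stacked weights; the gate/up split, activation, and routing-weight handling are assumptions for illustration, not the library implementation:

```python
import torch
import torch.nn.functional as F

def naive_experts_forward(x, gate_up_proj, down_proj, top_idx, top_w):
    # x: [tokens, H]; gate_up_proj: [E, 2*I, H]; down_proj: [E, H, I]
    # top_idx / top_w: [tokens, k] routing assignments and their weights.
    out = torch.zeros_like(x)
    for e in range(gate_up_proj.shape[0]):
        token_ids, slot = torch.where(top_idx == e)
        if token_ids.numel() == 0:
            continue
        gate_up = x[token_ids] @ gate_up_proj[e].T        # [n, 2*I]
        gate, up = gate_up.chunk(2, dim=-1)
        hidden = F.silu(gate) * up                        # [n, I]
        expert_out = hidden @ down_proj[e].T              # [n, H]
        out.index_add_(0, token_ids, expert_out * top_w[token_ids, slot, None])
    return out
```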
@@ -486,7 +487,9 @@ class Glm4MoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "hidden_states": Glm4MoeDecoderLayer,
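The hunk above ties `_can_compile_fullgraph` to `is_grouped_mm_available()`, so full-graph `torch.compile` support appears to be advertised only when the grouped matrix-multiply path is usable. A generic sketch of the same capability-gating pattern; the probe below is a stand-in, not the transformers helper:

```python
import torch

def grouped_mm_supported() -> bool:
    # Stand-in probe: the real check lives in transformers.utils and depends on
    # the installed torch version / available kernels.
    return hasattr(torch, "_grouped_mm")

class MoEModelBase:
    # Only claim full-graph compile support when a fused experts path exists;
    # a Python loop over experts would otherwise break the static graph.
    can_compile_fullgraph = grouped_mm_supported()
```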
@@ -499,6 +502,7 @@ class Glm4MoePreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, Glm4MoeTopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         elif isinstance(module, Glm4MoeNaiveMoe):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
@@ -353,7 +353,6 @@ class Glm4vImageProcessor(BaseImageProcessor):
             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                 Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                 `True`.
-                The max pixels of the image to resize the image.
             patch_size (`int`, *optional*, defaults to `self.patch_size`):
                 The spatial patch size of the vision encoder.
             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
@@ -380,12 +379,9 @@ class Glm4vImageProcessor(BaseImageProcessor):
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
 
         """
-        # Try to use config values if set, otherwise fallback to global defaults
         size = size if size is not None else self.size
         if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-        elif size is None:
-            size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
 
         do_resize = do_resize if do_resize is not None else self.do_resize
         resample = resample if resample is not None else self.resample