PyPI - transformers - Versions diffs - 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl - Mend

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (671)

transformers/models/layoutlmv3/modeling_layoutlmv3.py CHANGED Viewed

@@ -212,6 +212,10 @@ class LayoutLMv3PreTrainedModel(PreTrainedModel):
             if self.config.visual_embed:
                 init.zeros_(module.cls_token)
                 init.zeros_(module.pos_embed)
+            if hasattr(module, "visual_bbox"):
+                init.copy_(module.visual_bbox, module.create_visual_bbox(image_size=(module.size, module.size)))
+        elif isinstance(module, LayoutLMv3TextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 class LayoutLMv3SelfAttention(nn.Module):
@@ -576,16 +580,18 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
             # when the input_size is larger in fine-tuning, we will interpolate the position embeddings in forward
             self.patch_embed = LayoutLMv3PatchEmbeddings(config)
-            size = int(config.input_size / config.patch_size)
+            self.size = int(config.input_size / config.patch_size)
             self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
-            self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, config.hidden_size))
+            self.pos_embed = nn.Parameter(torch.zeros(1, self.size * self.size + 1, config.hidden_size))
             self.pos_drop = nn.Dropout(p=0.0)
             self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
             self.dropout = nn.Dropout(config.hidden_dropout_prob)
             if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
-                self.init_visual_bbox(image_size=(size, size))
+                self.register_buffer(
+                    "visual_bbox", self.create_visual_bbox(image_size=(self.size, self.size)), persistent=False
+                )
             self.norm = nn.LayerNorm(config.hidden_size, eps=1e-6)
@@ -599,7 +605,7 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
     def set_input_embeddings(self, value):
         self.embeddings.word_embeddings = value
-    def init_visual_bbox(self, image_size=(14, 14), max_len=1000):
+    def create_visual_bbox(self, image_size=(14, 14), max_len=1000):
         """
         Create the bounding boxes for the visual (patch) tokens.
         """
@@ -620,7 +626,7 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
         ).view(-1, 4)
         cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
-        self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)
+        return torch.cat([cls_token_box, visual_bbox], dim=0)
     def calculate_visual_bbox(self, device, dtype, batch_size):
         visual_bbox = self.visual_bbox.repeat(batch_size, 1, 1)
@@ -884,6 +890,12 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
         self.post_init()
+    def get_input_embeddings(self):
+        return self.layoutlmv3.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.layoutlmv3.set_input_embeddings(value)
     @auto_docstring
     def forward(
         self,
@@ -984,6 +996,12 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
         self.post_init()
+    def get_input_embeddings(self):
+        return self.layoutlmv3.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.layoutlmv3.set_input_embeddings(value)
     @auto_docstring
     def forward(
         self,
@@ -1104,6 +1122,12 @@ class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
         self.post_init()
+    def get_input_embeddings(self):
+        return self.layoutlmv3.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.layoutlmv3.set_input_embeddings(value)
     @auto_docstring
     def forward(
         self,

transformers/models/led/modeling_led.py CHANGED Viewed

@@ -23,6 +23,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -1077,6 +1078,11 @@ class LEDPreTrainedModel(PreTrainedModel):
         }
         return dummy_inputs
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, LEDForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
 @dataclass
 @auto_docstring(

transformers/models/levit/modeling_levit.py CHANGED Viewed

@@ -21,6 +21,7 @@ from typing import Optional, Union
 import torch
 from torch import nn
+from ... import initialization as init
 from ...modeling_outputs import (
     BaseModelOutputWithNoAttention,
     BaseModelOutputWithPoolingAndNoAttention,
@@ -165,6 +166,7 @@ class LevitAttention(nn.Module):
         points = list(itertools.product(range(resolution), range(resolution)))
         len_points = len(points)
+        self.len_points = len_points
         attention_offsets, indices = {}, []
         for p1 in points:
             for p2 in points:
@@ -172,6 +174,7 @@ class LevitAttention(nn.Module):
                 if offset not in attention_offsets:
                     attention_offsets[offset] = len(attention_offsets)
                 indices.append(attention_offsets[offset])
+        self.indices = indices
         self.attention_bias_cache = {}
         self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
@@ -243,6 +246,8 @@ class LevitAttentionSubsample(nn.Module):
         points = list(itertools.product(range(resolution_in), range(resolution_in)))
         points_ = list(itertools.product(range(resolution_out), range(resolution_out)))
         len_points, len_points_ = len(points), len(points_)
+        self.len_points_ = len_points_
+        self.len_points = len_points
         attention_offsets, indices = {}, []
         for p1 in points_:
             for p2 in points:
@@ -251,6 +256,7 @@ class LevitAttentionSubsample(nn.Module):
                 if offset not in attention_offsets:
                     attention_offsets[offset] = len(attention_offsets)
                 indices.append(attention_offsets[offset])
+        self.indices = indices
         self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
         self.register_buffer(
@@ -472,6 +478,18 @@ class LevitPreTrainedModel(PreTrainedModel):
     input_modalities = ("image",)
     _no_split_modules = ["LevitResidualLayer"]
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, LevitAttention):
+            init.copy_(
+                module.attention_bias_idxs, torch.LongTensor(module.indices).view(module.len_points, module.len_points)
+            )
+        elif isinstance(module, LevitAttentionSubsample):
+            init.copy_(
+                module.attention_bias_idxs,
+                torch.LongTensor(module.indices).view(module.len_points_, module.len_points),
+            )
 @auto_docstring
 class LevitModel(LevitPreTrainedModel):

transformers/models/lfm2/modeling_lfm2.py CHANGED Viewed

@@ -83,7 +83,7 @@ class Lfm2RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
     @staticmethod
     def compute_default_rope_parameters(

transformers/models/lfm2_moe/modeling_lfm2_moe.py CHANGED Viewed

@@ -27,7 +27,12 @@ from torch import nn
 from ... import initialization as init
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, MoeModelOutputWithPast
@@ -84,7 +89,7 @@ class Lfm2MoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
     @staticmethod
     def compute_default_rope_parameters(
@@ -145,6 +150,7 @@ class Lfm2MoeMLP(nn.Module):
         return self.w2(F.silu(self.w1(x)) * self.w3(x))
+@use_experts_implementation
 class Lfm2MoeExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
@@ -155,6 +161,7 @@ class Lfm2MoeExperts(nn.Module):
         self.intermediate_dim = config.moe_intermediate_size
         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
         self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
+        self.act_fn = F.silu
     def forward(
         self,
@@ -175,7 +182,7 @@ class Lfm2MoeExperts(nn.Module):
             top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
             current_state = hidden_states[token_idx]
             gate, up = nn.functional.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1)
-            current_hidden_states = F.silu(gate) * up
+            current_hidden_states = self.act_fn(gate) * up
             current_hidden_states = nn.functional.linear(current_hidden_states, self.down_proj[expert_idx])
             current_hidden_states = current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
             final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype))
@@ -671,7 +678,7 @@ class Lfm2MoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = False  # uses a non-compilable custom cache class Lfm2MoeHybridConvCache
     _supports_attention_backend = True
     _can_record_outputs = {
         "hidden_states": Lfm2MoeDecoderLayer,
@@ -684,6 +691,9 @@ class Lfm2MoePreTrainedModel(PreTrainedModel):
         if isinstance(module, Lfm2MoeExperts):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, Lfm2MoeSparseMoeBlock):
+            if module.use_expert_bias:
+                init.zeros_(module.expert_bias)
 @auto_docstring

transformers/models/lfm2_moe/modular_lfm2_moe.py CHANGED Viewed

@@ -72,33 +72,7 @@ class Lfm2MoeMLP(Lfm2MLP):
 class Lfm2MoeExperts(Qwen2MoeExperts):
     def __init__(self, config):
         super().__init__(config)
-        del self.act_fn
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        top_k_index: torch.Tensor,
-        top_k_weights: torch.Tensor,
-    ) -> torch.Tensor:
-        final_hidden_states = torch.zeros_like(hidden_states)
-        with torch.no_grad():
-            expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts)
-            expert_mask = expert_mask.permute(2, 1, 0)
-            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hit:
-            expert_idx = expert_idx[0]
-            if expert_idx == self.num_experts:
-                continue
-            top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
-            current_state = hidden_states[token_idx]
-            gate, up = nn.functional.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1)
-            current_hidden_states = F.silu(gate) * up
-            current_hidden_states = nn.functional.linear(current_hidden_states, self.down_proj[expert_idx])
-            current_hidden_states = current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
-            final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype))
-        return final_hidden_states
+        self.act_fn = F.silu
 class Lfm2MoeSparseMoeBlock(nn.Module):
@@ -160,7 +134,7 @@ class Lfm2MoeDecoderLayer(Lfm2DecoderLayer):
 class Lfm2MoePreTrainedModel(LlamaPreTrainedModel):
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = False  # uses a non-compilable custom cache class Lfm2MoeHybridConvCache
     @torch.no_grad()
     def _init_weights(self, module):
@@ -168,6 +142,9 @@ class Lfm2MoePreTrainedModel(LlamaPreTrainedModel):
         if isinstance(module, Lfm2MoeExperts):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, Lfm2MoeSparseMoeBlock):
+            if module.use_expert_bias:
+                init.zeros_(module.expert_bias)
 class Lfm2MoeModel(MixtralModel):

transformers/models/lfm2_vl/configuration_lfm2_vl.py CHANGED Viewed

@@ -46,6 +46,8 @@ class Lfm2VlConfig(PreTrainedConfig):
             The hidden size of the multimodal projector.
         projector_bias (`bool`, *optional*, defaults to `True`):
             Whether to use bias in the multimodal projector.
+        projector_use_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to use layernorm in the multimodal projector.
         downsample_factor (`int`, *optional*, defaults to 2):
             The downsample_factor factor of the vision backbone.
     """
@@ -61,6 +63,7 @@ class Lfm2VlConfig(PreTrainedConfig):
         projector_hidden_act="gelu",
         projector_hidden_size=2560,
         projector_bias=True,
+        projector_use_layernorm=True,
         downsample_factor=2,
         **kwargs,
     ):
@@ -68,6 +71,7 @@ class Lfm2VlConfig(PreTrainedConfig):
         self.projector_hidden_act = projector_hidden_act
         self.projector_hidden_size = projector_hidden_size
         self.projector_bias = projector_bias
+        self.projector_use_layernorm = projector_use_layernorm
         self.downsample_factor = downsample_factor
         if isinstance(vision_config, dict):

transformers/models/lfm2_vl/modeling_lfm2_vl.py CHANGED Viewed

@@ -41,7 +41,8 @@ class Lfm2VlMultiModalProjector(nn.Module):
         super().__init__()
         in_channels = config.vision_config.hidden_size * (config.downsample_factor**2)
         self.factor = config.downsample_factor
-        self.layer_norm = nn.LayerNorm(in_channels)
+        self.use_layer_norm = config.projector_use_layernorm
+        self.layer_norm = nn.LayerNorm(in_channels) if config.projector_use_layernorm else None
         self.linear_1 = nn.Linear(
             in_channels,
             config.projector_hidden_size,
@@ -56,7 +57,8 @@ class Lfm2VlMultiModalProjector(nn.Module):
     def forward(self, image_features: torch.Tensor):
         image_features = self.pixel_unshuffle(image_features)
-        image_features = self.layer_norm(image_features)
+        if self.use_layer_norm:
+            image_features = self.layer_norm(image_features)
         hidden_states = self.linear_1(image_features)
         hidden_states = self.act(hidden_states)
         hidden_states = self.linear_2(hidden_states)
@@ -448,6 +450,7 @@ class Lfm2VlForConditionalGeneration(Lfm2VlPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -459,12 +462,15 @@ class Lfm2VlForConditionalGeneration(Lfm2VlPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
         return model_inputs

transformers/models/lfm2_vl/modular_lfm2_vl.py CHANGED Viewed

@@ -41,7 +41,8 @@ class Lfm2VlMultiModalProjector(nn.Module):
         super().__init__()
         in_channels = config.vision_config.hidden_size * (config.downsample_factor**2)
         self.factor = config.downsample_factor
-        self.layer_norm = nn.LayerNorm(in_channels)
+        self.use_layer_norm = config.projector_use_layernorm
+        self.layer_norm = nn.LayerNorm(in_channels) if config.projector_use_layernorm else None
         self.linear_1 = nn.Linear(
             in_channels,
             config.projector_hidden_size,
@@ -56,7 +57,8 @@ class Lfm2VlMultiModalProjector(nn.Module):
     def forward(self, image_features: torch.Tensor):
         image_features = self.pixel_unshuffle(image_features)
-        image_features = self.layer_norm(image_features)
+        if self.use_layer_norm:
+            image_features = self.layer_norm(image_features)
         hidden_states = self.linear_1(image_features)
         hidden_states = self.act(hidden_states)
         hidden_states = self.linear_2(hidden_states)

transformers/models/lfm2_vl/processing_lfm2_vl.py CHANGED Viewed

@@ -165,63 +165,103 @@ class Lfm2VlProcessor(ProcessorMixin):
         image_sizes: list[list[int]],
         use_image_special_tokens: bool,
         **images_kwargs,
-    ):
-        prompt_strings = []
+    ) -> list[str]:
+        use_thumbnail = images_kwargs.get("use_thumbnail", self.image_processor.use_thumbnail)
+        image_data = iter(zip(image_rows, image_cols, image_sizes))
-        image_data = iter(zip(*[image_rows, image_cols, image_sizes]))
+        prompt_strings = []
         for sample_text, sample_images in zip(text, images):
-            split_sample = sample_text.split(self.image_token)
-            sample_text_with_image_tokens = ""
-            for i, image in enumerate(sample_images):
-                sample_text_with_image_tokens += split_sample[i]
-                if use_image_special_tokens:
-                    sample_text_with_image_tokens += self.image_start_token
+            text_parts = sample_text.split(self.image_token)
+            result_parts = []
+            for i, _ in enumerate(sample_images):
+                result_parts.append(text_parts[i])
                 rows, cols, image_size = next(image_data)
-                num_thumbnail_tokens, num_tokens_per_tile = self._get_image_num_tokens(image_size, **images_kwargs)
-                if rows > 1 or cols > 1:
-                    for row in range(rows):
-                        for col in range(cols):
-                            if use_image_special_tokens:
-                                sample_text_with_image_tokens += f"<|img_row_{row + 1}_col_{col + 1}|>"
-                            sample_text_with_image_tokens += self.image_token * num_tokens_per_tile
-                    if num_thumbnail_tokens > 0:
-                        if use_image_special_tokens:
-                            sample_text_with_image_tokens += self.image_thumbnail_token
-                        sample_text_with_image_tokens += self.image_token * num_thumbnail_tokens
-                else:
-                    sample_text_with_image_tokens += self.image_token * num_thumbnail_tokens
+                tokens_per_tile, tokens_for_image = self._get_image_num_tokens(image_size, **images_kwargs)
+                image_tokens = self._build_image_tokens(
+                    rows,
+                    cols,
+                    tokens_per_tile,
+                    tokens_for_image,
+                    use_thumbnail,
+                    use_image_special_tokens,
+                )
+                result_parts.append(image_tokens)
-                if use_image_special_tokens:
-                    sample_text_with_image_tokens += self.image_end_token
+            # Add remaining text after the last image
+            if len(sample_images) < len(text_parts):
+                result_parts.append(text_parts[-1])
-                sample_text_with_image_tokens += split_sample[i + 1]
-            prompt_strings.append(sample_text_with_image_tokens)
+            prompt_strings.append("".join(result_parts))
         return prompt_strings
+    def _build_image_tokens(
+        self,
+        rows: int,
+        cols: int,
+        tokens_per_tile: int,
+        tokens_for_image: int,
+        use_thumbnail: bool,
+        use_image_special_tokens: bool,
+    ) -> str:
+        """Build the expanded token string for a single image."""
+        parts = []
+        if use_image_special_tokens:
+            parts.append(self.image_start_token)
+        is_multi_tile = rows > 1 or cols > 1
+        if is_multi_tile:
+            for row in range(rows):
+                for col in range(cols):
+                    if use_image_special_tokens:
+                        parts.append(f"<|img_row_{row + 1}_col_{col + 1}|>")
+                    parts.append(self.image_token * tokens_per_tile)
+            if use_thumbnail:
+                if use_image_special_tokens:
+                    parts.append(self.image_thumbnail_token)
+                parts.append(self.image_token * tokens_for_image)
+        else:
+            parts.append(self.image_token * tokens_for_image)
+        if use_image_special_tokens:
+            parts.append(self.image_end_token)
+        return "".join(parts)
+    def _compute_tokens_per_tile(self, tile_size: int, encoder_patch_size: int, downsample_factor: int) -> int:
+        """Compute the number of tokens for a single tile."""
+        num_patches = tile_size // encoder_patch_size
+        downsampled_patches = math.ceil(num_patches / downsample_factor)
+        return downsampled_patches * downsampled_patches
+    def _compute_tokens_for_image(self, image_size: list[int], encoder_patch_size: int, downsample_factor: int) -> int:
+        """Compute the number of tokens for a resized image (used for single-tile or thumbnail)."""
+        image_height, image_width = image_size
+        patches_h = math.ceil((image_height // encoder_patch_size) / downsample_factor)
+        patches_w = math.ceil((image_width // encoder_patch_size) / downsample_factor)
+        return patches_h * patches_w
     def _get_image_num_tokens(self, image_size: list[int], **images_kwargs) -> tuple[int, int]:
+        """
+        Compute token counts for image processing.
+        Returns:
+            tuple[int, int]: (tokens_per_tile, tokens_for_image)
+                - tokens_per_tile: tokens for each tile in multi-tile mode
+                - tokens_for_image: tokens for the resized image (single-tile) or thumbnail (multi-tile)
+        """
         tile_size = images_kwargs.get("tile_size", self.image_processor.tile_size)
         downsample_factor = images_kwargs.get("downsample_factor", self.image_processor.downsample_factor)
         encoder_patch_size = images_kwargs.get("encoder_patch_size", self.image_processor.encoder_patch_size)
-        use_thumbnail = images_kwargs.get("use_thumbnail", self.image_processor.use_thumbnail)
-        thumbnail_tokens = 0
-        if use_thumbnail:
-            image_height, image_width = image_size
-            num_patches_height = image_height // encoder_patch_size
-            num_patches_width = image_width // encoder_patch_size
-            dwn_num_patches_height = math.ceil(num_patches_height / downsample_factor)
-            dwn_num_patches_width = math.ceil(num_patches_width / downsample_factor)
-            thumbnail_tokens = dwn_num_patches_height * dwn_num_patches_width
-        num_patches_tile = tile_size // encoder_patch_size
-        dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor)
-        tile_tokens = dwn_num_patches_tile * dwn_num_patches_tile
+        tokens_per_tile = self._compute_tokens_per_tile(tile_size, encoder_patch_size, downsample_factor)
+        tokens_for_image = self._compute_tokens_for_image(image_size, encoder_patch_size, downsample_factor)
-        return thumbnail_tokens, tile_tokens
+        return tokens_per_tile, tokens_for_image
     def batch_decode(self, *args, **kwargs):
         """

transformers/models/lightglue/image_processing_lightglue_fast.py CHANGED Viewed

@@ -174,9 +174,8 @@ class LightGlueImageProcessorFast(BaseImageProcessorFast):
         stacked_pairs = [torch.stack(pair, dim=0) for pair in image_pairs]
         # Return in same format as slow processor
-        image_pairs = torch.stack(stacked_pairs, dim=0) if return_tensors else stacked_pairs
-        return BatchFeature(data={"pixel_values": image_pairs})
+        return BatchFeature(data={"pixel_values": stacked_pairs}, tensor_type=return_tensors)
     def post_process_keypoint_matching(
         self,

transformers/models/lilt/modeling_lilt.py CHANGED Viewed

@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@@ -279,11 +280,9 @@ class LiltSelfAttention(nn.Module):
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_context_layer_shape)
-        outputs = (
-            ((context_layer, layout_context_layer), attention_probs)
-            if output_attentions
-            else ((context_layer, layout_context_layer),)
-        )
+        outputs = (context_layer, layout_context_layer)
+        if output_attentions:
+            outputs = outputs + (attention_probs,)
         return outputs
@@ -327,9 +326,9 @@ class LiltAttention(nn.Module):
             attention_mask,
             output_attentions,
         )
-        attention_output = self.output(self_outputs[0][0], hidden_states)
-        layout_attention_output = self.layout_output(self_outputs[0][1], layout_inputs)
-        outputs = ((attention_output, layout_attention_output),) + self_outputs[1:]  # add attentions if we output them
+        attention_output = self.output(self_outputs[0], hidden_states)
+        layout_attention_output = self.layout_output(self_outputs[1], layout_inputs)
+        outputs = (attention_output, layout_attention_output) + self_outputs[2:]  # add attentions if we output them
         return outputs
@@ -395,10 +394,10 @@ class LiltLayer(GradientCheckpointingLayer):
             attention_mask,
             output_attentions=output_attentions,
         )
-        attention_output = self_attention_outputs[0][0]
-        layout_attention_output = self_attention_outputs[0][1]
+        attention_output = self_attention_outputs[0]
+        layout_attention_output = self_attention_outputs[1]
-        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+        outputs = self_attention_outputs[2:]  # add self attentions if we output attention weights
         layer_output = apply_chunking_to_forward(
             self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
@@ -406,7 +405,7 @@ class LiltLayer(GradientCheckpointingLayer):
         layout_layer_output = apply_chunking_to_forward(
             self.layout_feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, layout_attention_output
         )
-        outputs = ((layer_output, layout_layer_output),) + outputs
+        outputs = (layer_output, layout_layer_output) + outputs
         return outputs
@@ -451,11 +450,11 @@ class LiltEncoder(nn.Module):
                 output_attentions,
             )
-            hidden_states = layer_outputs[0][0]
-            layout_inputs = layer_outputs[0][1]
+            hidden_states = layer_outputs[0]
+            layout_inputs = layer_outputs[1]
             if output_attentions:
-                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                all_self_attentions = all_self_attentions + (layer_outputs[2],)
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
@@ -500,6 +499,11 @@ class LiltPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = []
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, LiltTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 @auto_docstring
 class LiltModel(LiltPreTrainedModel):

transformers/models/llama/modeling_llama.py CHANGED Viewed

@@ -87,7 +87,7 @@ class LlamaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
     @staticmethod
     def compute_default_rope_parameters(

transformers/models/llama4/image_processing_llama4_fast.py CHANGED Viewed

@@ -419,10 +419,9 @@ class Llama4ImageProcessorFast(BaseImageProcessorFast):
                 )
                 grouped_processed_images[shape] = torch.cat([processed_images, global_tiles.unsqueeze(1)], dim=1)
         processed_images = reorder_images(grouped_processed_images, grouped_images_index)
-        aspect_ratios_list = reorder_images(grouped_aspect_ratios, grouped_images_index)
+        aspect_ratios = reorder_images(grouped_aspect_ratios, grouped_images_index)
         processed_images = torch.cat(processed_images, dim=0) if return_tensors else processed_images
-        aspect_ratios = torch.stack(aspect_ratios_list, dim=0) if return_tensors else aspect_ratios_list
         return BatchFeature(
             data={"pixel_values": processed_images, "aspect_ratios": aspect_ratios}, tensor_type=return_tensors
         )

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl