transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff shows the changes between two publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -196,7 +196,7 @@ class DeepseekVLModel(DeepseekVLPreTrainedModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -268,7 +268,7 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMi
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -315,6 +315,7 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMi
         inputs_embeds=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing
@@ -326,12 +327,15 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMi
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
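
The `is_first_iteration` hunks above gate multimodal inputs during generation: pixel values are forwarded only on the first step, or whenever the KV cache is disabled, since later steps reuse image features already merged into the cache. A minimal standalone sketch of that pattern (the helper name is made up for illustration; this is not the transformers implementation itself):

# Sketch of the gating introduced by the hunks above.
def gate_pixel_values(model_inputs, pixel_values, is_first_iteration=False, **kwargs):
    # On later iterations the image features are already in the cache,
    # so re-sending pixel_values would be redundant.
    if is_first_iteration or not kwargs.get("use_cache", True):
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

first = gate_pixel_values({}, pixel_values="img", is_first_iteration=True)
later = gate_pixel_values({}, pixel_values="img", is_first_iteration=False)
assert "pixel_values" in first and "pixel_values" not in later
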
@@ -134,6 +134,9 @@ class DeepseekVLAligner(nn.Module):
 class DeepseekVLPreTrainedModel(JanusPreTrainedModel):
     _no_split_modules = ["LlamaDecoderLayer"]

+    def _init_weights(self, module):
+        raise AttributeError("No need to inherit!")
+

 @auto_docstring
 class DeepseekVLModel(JanusModel):
@@ -207,9 +207,6 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
             )
             high_res_processed_images_grouped[shape] = stacked_high_res_images
         high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
-        high_res_processed_images = (
-            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
-        )

         resized_images_grouped = {}
         for shape, stacked_high_res_padded_images in high_res_padded_images.items():
@@ -233,7 +230,6 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
             )
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(
             data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
@@ -314,7 +314,7 @@ class DeepseekVLHybridModel(DeepseekVLHybridPreTrainedModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLHybridBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -424,7 +424,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel,
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLHybridCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -473,6 +473,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel,
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -482,12 +483,15 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel,
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["high_res_pixel_values"] = high_res_pixel_values

@@ -297,7 +297,7 @@ class DeepseekVLHybridModel(DeepseekVLModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLHybridBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -361,7 +361,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLHybridCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -410,6 +410,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -419,12 +420,15 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["high_res_pixel_values"] = high_res_pixel_values

@@ -888,9 +892,6 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
             )
             high_res_processed_images_grouped[shape] = stacked_high_res_images
         high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
-        high_res_processed_images = (
-            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
-        )

         resized_images_grouped = {}
         for shape, stacked_high_res_padded_images in high_res_padded_images.items():
@@ -914,7 +915,6 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
             )
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(
             data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
@@ -37,7 +37,7 @@ class DeformableDetrConfig(PreTrainedConfig):
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
             API.
-        backbone_config (`PreTrainedConfig
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
         num_channels (`int`, *optional*, defaults to 3):
@@ -269,8 +269,8 @@ class DeformableDetrConfig(PreTrainedConfig):
         self.eos_coefficient = eos_coefficient
         self.focal_alpha = focal_alpha
         self.disable_custom_kernels = disable_custom_kernels
+
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
-        self.tie_encoder_decoder = True


 __all__ = ["DeformableDetrConfig"]
@@ -956,7 +956,7 @@ class DeformableDetrPreTrainedModel(PreTrainedModel):
             init.constant_(module.value_proj.bias, 0.0)
             init.xavier_uniform_(module.output_proj.weight)
             init.constant_(module.output_proj.bias, 0.0)
-        elif isinstance(module, (nn.Linear, nn.Conv2d
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
@@ -34,9 +34,8 @@ class DepthAnythingConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.

     Args:
-        backbone_config (`Union[dict
-            The configuration of the backbone model.
-            leverage the [`AutoBackbone`] API.
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Dinov2Config()`):
+            The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
             will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
@@ -94,7 +94,6 @@ class DepthProImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -37,7 +37,7 @@ class DetrConfig(PreTrainedConfig):
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
             API.
-        backbone_config (`PreTrainedConfig
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
         num_channels (`int`, *optional*, defaults to 3):
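
The corrected docstring says `backbone_config` accepts either a dict or a `PreTrainedConfig` instance. A hedged usage sketch of what that implies (assuming the standard DETR config API; not taken from the diff itself):

# Both forms are expected to be equivalent when use_timm_backbone=False;
# a dict is resolved to its config class via the "model_type" key.
from transformers import DetrConfig, ResNetConfig

cfg_from_object = DetrConfig(use_timm_backbone=False, backbone_config=ResNetConfig())
cfg_from_dict = DetrConfig(use_timm_backbone=False, backbone_config={"model_type": "resnet"})
print(type(cfg_from_object.backbone_config), type(cfg_from_dict.backbone_config))
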
@@ -741,7 +741,7 @@ class DetrPreTrainedModel(PreTrainedModel):
         elif isinstance(module, DetrLearnedPositionEmbedding):
             init.uniform_(module.row_embeddings.weight)
             init.uniform_(module.column_embeddings.weight)
-        if isinstance(module, (nn.Linear, nn.Conv2d
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
@@ -750,6 +750,9 @@ class DetrPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            init.ones_(module.weight)
+            init.zeros_(module.bias)


 class DetrEncoder(DetrPreTrainedModel):
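
The new branch resets normalization layers to the conventional identity initialization. A standalone illustration using plain `torch.nn.init` (the diff itself goes through the library's `initialization` module, imported as `init`):

import torch
from torch import nn

def init_norms_(module: nn.Module) -> None:
    # Same effect as the added branch: weight -> 1, bias -> 0 for norm layers.
    if isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
        nn.init.ones_(module.weight)
        nn.init.zeros_(module.bias)

layer = nn.LayerNorm(8)
with torch.no_grad():
    layer.weight.fill_(3.0)  # pretend the weight was perturbed
init_norms_(layer)
assert torch.all(layer.weight == 1) and torch.all(layer.bias == 0)
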
@@ -1457,8 +1460,12 @@ class DetrForSegmentation(DetrPreTrainedModel):

         >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
         >>> panoptic_seg = result[0]["segmentation"]
+        >>> panoptic_seg.shape
+        torch.Size([300, 500])
         >>> # Get prediction score and segment_id to class_id mapping of each segment
         >>> panoptic_segments_info = result[0]["segments_info"]
+        >>> len(panoptic_segments_info)
+        5
         ```"""

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -110,11 +110,9 @@ class DiaGenerationMixin(GenerationMixin):
         return merged_processors

     def _prepare_generation_config(
-        self, generation_config: Optional[GenerationConfig],
+        self, generation_config: Optional[GenerationConfig], **kwargs: Any
     ) -> tuple[GenerationConfig, dict]:
-        generation_config, model_kwargs = super()._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs)

         # We allow generation up to max length + max delay pattern
         # (will revert back to max length after generation)
@@ -260,7 +258,6 @@ class DiaGenerationMixin(GenerationMixin):
         streamer: Optional["BaseStreamer"] = None,
         negative_prompt_ids: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
-        use_model_defaults: Optional[bool] = None,
         custom_generate: Optional[str] = None,
         **kwargs,
     ):
@@ -273,9 +270,7 @@ class DiaGenerationMixin(GenerationMixin):
             assistant_model,
             streamer,
         )
-        generation_config, model_kwargs = self._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_mode = generation_config.get_generation_mode(assistant_model)

         if generation_mode not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
@@ -425,7 +420,6 @@ class DiaGenerationMixin(GenerationMixin):
|
|
|
425
420
|
streamer: Optional["BaseStreamer"] = None,
|
|
426
421
|
negative_prompt_ids: Optional[torch.Tensor] = None,
|
|
427
422
|
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
|
|
428
|
-
use_model_defaults: Optional[bool] = None,
|
|
429
423
|
custom_generate: Optional[str] = None,
|
|
430
424
|
**kwargs,
|
|
431
425
|
) -> Union[GenerateOutput, torch.LongTensor]:
|
|
@@ -445,7 +439,6 @@ class DiaGenerationMixin(GenerationMixin):
|
|
|
445
439
|
streamer=streamer,
|
|
446
440
|
negative_prompt_ids=negative_prompt_ids,
|
|
447
441
|
negative_prompt_attention_mask=negative_prompt_attention_mask,
|
|
448
|
-
use_model_defaults=use_model_defaults,
|
|
449
442
|
custom_generate=custom_generate,
|
|
450
443
|
**kwargs,
|
|
451
444
|
)
|
|
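The hunks above drop the dedicated `use_model_defaults` argument and route everything through `**kwargs`. A minimal sketch of the resulting override pattern, using a hypothetical subclass rather than the Dia code and assuming the rc2 base signature shown in the diff:

from typing import Any, Optional

from transformers import GenerationConfig, GenerationMixin


class MyGenerationMixin(GenerationMixin):  # hypothetical subclass for illustration
    def _prepare_generation_config(
        self, generation_config: Optional[GenerationConfig], **kwargs: Any
    ) -> tuple[GenerationConfig, dict]:
        # `use_model_defaults` is no longer threaded through explicitly; it travels in **kwargs if given.
        generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs)
        return generation_config, model_kwargs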
@@ -25,6 +25,7 @@ from typing import Optional, Union
 import torch
 from torch import nn

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
@@ -61,6 +62,12 @@ class DiaPreTrainedModel(PreTrainedModel):
     main_input_name = "input_ids"
     _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, DiaMultiChannelEmbedding):
+            offsets = torch.arange(self.config.num_channels, dtype=torch.long) * self.config.vocab_size
+            init.copy_(module.offsets, offsets)
+

 class DiaMultiChannelEmbedding(nn.Module):
     """In order to efficiently compute the audio embedding from the 9 different channels,
@@ -146,7 +153,7 @@ class DiaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -452,6 +459,8 @@ class DiaEncoder(DiaPreTrainedModel):
         self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
         self.rotary_emb = DiaRotaryEmbedding(config=config)

+        self.post_init()
+
     @auto_docstring
     @can_return_tuple
     def forward(
@@ -578,6 +587,8 @@ class DiaDecoder(DiaPreTrainedModel):
         self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
         self.rotary_emb = DiaRotaryEmbedding(config=config)

+        self.post_init()
+
     @auto_docstring
     @can_return_tuple
     def forward(
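The new `_init_weights` branch computes one vocabulary offset per audio channel so the multi-channel embedding can index a single flat table. A minimal numeric sketch (9 channels matches the docstring quoted above; `vocab_size=1028` is an illustrative value, not necessarily the checkpoint default):

import torch

num_channels, vocab_size = 9, 1028
offsets = torch.arange(num_channels, dtype=torch.long) * vocab_size
print(offsets)  # tensor([   0, 1028, 2056, 3084, 4112, 5140, 6168, 7196, 8224])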
@@ -20,6 +20,7 @@ from typing import Optional, Union
 import torch
 from torch import nn

+from ... import initialization as init
 from ...cache_utils import DynamicCache, EncoderDecoderCache
 from ...masking_utils import create_bidirectional_mask, create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
@@ -59,6 +60,12 @@ class DiaPreTrainedModel(PreTrainedModel):
     main_input_name = "input_ids"
     _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, DiaMultiChannelEmbedding):
+            offsets = torch.arange(self.config.num_channels, dtype=torch.long) * self.config.vocab_size
+            init.copy_(module.offsets, offsets)
+

 class DiaMultiChannelEmbedding(nn.Module):
     """In order to efficiently compute the audio embedding from the 9 different channels,
@@ -241,6 +248,8 @@ class DiaEncoder(DiaPreTrainedModel):
         self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
         self.rotary_emb = DiaRotaryEmbedding(config=config)

+        self.post_init()
+
     @auto_docstring
     @can_return_tuple
     def forward(
@@ -367,6 +376,8 @@ class DiaDecoder(DiaPreTrainedModel):
         self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
         self.rotary_emb = DiaRotaryEmbedding(config=config)

+        self.post_init()
+
     @auto_docstring
     @can_return_tuple
     def forward(
@@ -74,7 +74,7 @@ class DiaProcessor(ProcessorMixin):
         tokenizer (`DiaTokenizer`):
             An instance of [`DiaTokenizer`]. The tokenizer is a required input.
         audio_tokenizer (`DacModel`):
-            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is
+            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is a required input.
     """

     audio_tokenizer_class = "DacModel"
@@ -86,7 +86,7 @@ class DiffLlamaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
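Here, as in the Dia and Doge rotary embeddings, the truncated `self.original_inv_freq = ...` assignment is replaced by a registered, non-persistent buffer, so the saved copy of the RoPE frequencies follows `.to()`/`.cuda()` moves without entering the state dict. A minimal sketch of the pattern with a toy module (not the transformers class):

import torch
from torch import nn


class TinyRotary(nn.Module):  # hypothetical stand-in
    def __init__(self, dim: int = 8, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


module = TinyRotary()
print(module.original_inv_freq.shape)     # torch.Size([4])
print(list(module.state_dict().keys()))   # [] -- non-persistent buffers are excluded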
@@ -361,8 +361,8 @@ class DiffLlamaFlashAttention2(DiffLlamaAttention):
                 else torch.get_autocast_gpu_dtype()
             )
         # Handle the case where the model is quantized
-        elif hasattr(self.config, "
-            target_dtype = self.config.
+        elif hasattr(self.config, "quantization_config"):
+            target_dtype = self.config.dtype
         else:
             target_dtype = self.q_proj.weight.dtype

@@ -236,8 +236,8 @@ class DiffLlamaFlashAttention2(DiffLlamaAttention):
                 else torch.get_autocast_gpu_dtype()
             )
         # Handle the case where the model is quantized
-        elif hasattr(self.config, "
-            target_dtype = self.config.
+        elif hasattr(self.config, "quantization_config"):
+            target_dtype = self.config.dtype
         else:
             target_dtype = self.q_proj.weight.dtype

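Both copies of the flash-attention hunk restore the quantized-model branch: when the config carries a `quantization_config`, the upcast target falls back to the configured compute dtype. A minimal sketch of that control flow with a stand-in config object (not the DiffLlama code):

import torch


class _Config:  # hypothetical stand-in
    quantization_config = {"quant_method": "bnb"}
    dtype = torch.float16


config = _Config()
if hasattr(config, "quantization_config"):  # quantized model: use the configured compute dtype
    target_dtype = config.dtype
else:
    target_dtype = torch.float32
print(target_dtype)  # torch.float16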
@@ -88,7 +88,6 @@ class DINOv3ViTImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -466,6 +466,9 @@ class DINOv3ViTPreTrainedModel(PreTrainedModel):
             init.zeros_(module.mask_token)
         elif isinstance(module, DINOv3ViTLayerScale):
             init.constant_(module.lambda1, self.config.layerscale_value)
+        elif isinstance(module, DINOv3ViTRopePositionEmbedding):
+            inv_freq = 1 / module.base ** torch.arange(0, 1, 4 / module.head_dim, dtype=torch.float32)
+            init.copy_(module.inv_freq, inv_freq)


 @auto_docstring
@@ -361,6 +361,9 @@ class DINOv3ViTPreTrainedModel(Dinov2PreTrainedModel):
             init.zeros_(module.mask_token)
         elif isinstance(module, DINOv3ViTLayerScale):
             init.constant_(module.lambda1, self.config.layerscale_value)
+        elif isinstance(module, DINOv3ViTRopePositionEmbedding):
+            inv_freq = 1 / module.base ** torch.arange(0, 1, 4 / module.head_dim, dtype=torch.float32)
+            init.copy_(module.inv_freq, inv_freq)


 @auto_docstring
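The added branch seeds the DINOv3 RoPE inverse frequencies as `1 / base ** t` for `head_dim // 4` evenly spaced exponents `t` in `[0, 1)`. A minimal numeric sketch (`base=100.0` and `head_dim=64` are illustrative values, not necessarily the model defaults):

import torch

base, head_dim = 100.0, 64
exponents = torch.arange(0, 1, 4 / head_dim, dtype=torch.float32)  # 16 exponents: 0.0, 0.0625, ..., 0.9375
inv_freq = 1 / base**exponents
print(exponents.shape, inv_freq[:3])  # torch.Size([16]) tensor([1.0000, 0.7499, 0.5623])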
@@ -305,15 +305,17 @@ class DistilBertPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module: nn.Module):
         """Initialize the weights."""
         super()._init_weights(module)
-        if isinstance(module, Embeddings)
-
-
-
-
-
-
-
-
+        if isinstance(module, Embeddings):
+            if self.config.sinusoidal_pos_embds:
+                init.copy_(
+                    module.position_embeddings.weight,
+                    create_sinusoidal_embeddings(
+                        self.config.max_position_embeddings,
+                        self.config.dim,
+                        torch.empty_like(module.position_embeddings.weight),
+                    ),
+                )
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 @auto_docstring
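The rewritten branch copies a freshly built sinusoidal table into `position_embeddings.weight` when `sinusoidal_pos_embds` is set, and re-fills the `position_ids` buffer. A minimal sketch of the standard sinusoidal scheme, written from scratch here rather than calling the library's `create_sinusoidal_embeddings` helper:

import math

import torch


def sinusoidal_table(n_pos: int, dim: int) -> torch.Tensor:
    position = torch.arange(n_pos, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32) * (-math.log(10000.0) / dim))
    table = torch.zeros(n_pos, dim)
    table[:, 0::2] = torch.sin(position * div_term)  # even columns: sine
    table[:, 1::2] = torch.cos(position * div_term)  # odd columns: cosine
    return table


print(sinusoidal_table(512, 768).shape)  # torch.Size([512, 768])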
@@ -88,7 +88,7 @@ class DogeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -231,7 +231,6 @@ class DonutImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -381,18 +381,7 @@ class DonutSwinSelfAttention(nn.Module):
             torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
         )

-
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
-        coords_flatten = torch.flatten(coords, 1)
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
-        relative_coords[:, :, 0] += self.window_size[0] - 1
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)
-        self.register_buffer("relative_position_index", relative_position_index)
+        self.register_buffer("relative_position_index", self.create_relative_position_index())

         self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
         self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
@@ -451,6 +440,20 @@ class DonutSwinSelfAttention(nn.Module):

         return outputs

+    def create_relative_position_index(self):
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
+        coords_flatten = torch.flatten(coords, 1)
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        relative_coords[:, :, 0] += self.window_size[0] - 1
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)
+        return relative_position_index
+

 # Copied from transformers.models.swin.modeling_swin.SwinSelfOutput
 class DonutSwinSelfOutput(nn.Module):
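The relative-position-index computation is now shared between `__init__` and `_init_weights` via `create_relative_position_index`. A minimal worked example of the same arithmetic for a 2x2 window (real checkpoints typically use larger windows):

import torch

window_size = (2, 2)
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij"))
coords_flatten = torch.flatten(coords, 1)
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
relative_coords = relative_coords.permute(1, 2, 0).contiguous()
relative_coords[:, :, 0] += window_size[0] - 1   # shift row deltas to start at 0
relative_coords[:, :, 1] += window_size[1] - 1   # shift column deltas to start at 0
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = relative_coords.sum(-1)
print(relative_position_index)
# tensor([[4, 3, 1, 0],
#         [5, 4, 2, 1],
#         [7, 6, 4, 3],
#         [8, 7, 5, 4]])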
@@ -801,6 +804,7 @@ class DonutSwinPreTrainedModel(PreTrainedModel):
             init.zeros_(module.position_embeddings)
         elif isinstance(module, DonutSwinSelfAttention):
             init.zeros_(module.relative_position_bias_table)
+            init.copy_(module.relative_position_index, module.create_relative_position_index())


 @auto_docstring