PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/timm_wrapper/modeling_timm_wrapper.py CHANGED Viewed

@@ -115,10 +115,14 @@ class TimmWrapperPreTrainedModel(PreTrainedModel):
         # Also, reinit all non-persistent buffers if any!
         if hasattr(module, "init_non_persistent_buffers"):
             module.init_non_persistent_buffers()
-        elif isinstance(module, nn.BatchNorm2d) and getattr(module, "running_mean", None) is not None:
-            init.zeros_(module.running_mean)
-            init.ones_(module.running_var)
-            init.zeros_(module.num_batches_tracked)
+        elif isinstance(module, nn.BatchNorm2d):
+            # TimmWrapper always creates models with pretrained=False, so buffers are never pre-loaded
+            # Always initialize buffers (handles both meta device and to_empty() cases)
+            running_mean = getattr(module, "running_mean", None)
+            if running_mean is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
     def _timm_model_supports_gradient_checkpointing(self):
         """
@@ -168,6 +172,7 @@ class TimmWrapperModel(TimmWrapperPreTrainedModel):
         output_hidden_states: bool | list[int] | None = None,
         return_dict: bool | None = None,
         do_pooling: bool | None = None,
+        use_cache: bool | None = None,
         **kwargs,
     ) -> TimmWrapperModelOutput | tuple[Tensor, ...]:
         r"""

transformers/models/trocr/configuration_trocr.py CHANGED Viewed

@@ -114,8 +114,14 @@ class TrOCRConfig(PreTrainedConfig):
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
+        cross_attention_hidden_size=None,
+        is_decoder=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
+        self.cross_attention_hidden_size = cross_attention_hidden_size
+        self.is_decoder = is_decoder
+        self.tie_word_embeddings = tie_word_embeddings
         self.vocab_size = vocab_size
         self.d_model = d_model
         self.decoder_layers = decoder_layers
@@ -132,14 +138,12 @@ class TrOCRConfig(PreTrainedConfig):
         self.scale_embedding = scale_embedding
         self.use_learned_position_embeddings = use_learned_position_embeddings
         self.layernorm_embedding = layernorm_embedding
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 __all__ = ["TrOCRConfig"]

transformers/models/trocr/modeling_trocr.py CHANGED Viewed

@@ -703,7 +703,8 @@ class TrOCRForCausalLM(TrOCRPreTrainedModel, GenerationMixin):
         ...     ViTModel,
         ...     VisionEncoderDecoderModel,
         ... )
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from PIL import Image
         >>> # TrOCR is a decoder model and should be used within a VisionEncoderDecoderModel
@@ -718,7 +719,8 @@ class TrOCRForCausalLM(TrOCRPreTrainedModel, GenerationMixin):
         >>> # load image from the IAM dataset
         >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read())).convert("RGB")
         >>> pixel_values = processor(image, return_tensors="pt").pixel_values
         >>> text = "industry, ' Mr. Brown commented icily. ' Let us have a"

transformers/models/tvp/configuration_tvp.py CHANGED Viewed

@@ -13,10 +13,10 @@
 # limitations under the License.
 """TVP model configuration"""
+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import verify_backbone_config_arguments
-from ..auto import CONFIG_MAPPING, AutoConfig
+from ..auto import AutoConfig
 logger = logging.get_logger(__name__)
@@ -36,18 +36,6 @@ class TvpConfig(PreTrainedConfig):
     Args:
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
-            library.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         distance_loss_weight (`float`, *optional*, defaults to 1.0):
             The weight of distance loss.
         duration_loss_weight (`float`, *optional*, defaults to 0.1):
@@ -95,6 +83,8 @@ class TvpConfig(PreTrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability of attention layers.
+        pad_token_id (`int`, *optional*):
+            The id of a PAD token in the vocabulary.
     """
     model_type = "tvp"
@@ -103,10 +93,6 @@ class TvpConfig(PreTrainedConfig):
     def __init__(
         self,
         backbone_config=None,
-        backbone=None,
-        use_pretrained_backbone=False,
-        use_timm_backbone=False,
-        backbone_kwargs=None,
         distance_loss_weight=1.0,
         duration_loss_weight=0.1,
         visual_prompter_type="framepad",
@@ -128,29 +114,17 @@ class TvpConfig(PreTrainedConfig):
         layer_norm_eps=1e-12,
         initializer_range=0.02,
         attention_probs_dropout_prob=0.1,
+        pad_token_id=None,
         **kwargs,
     ):
-        if backbone_config is None and backbone is None:
-            logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
-            backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.get("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
+            default_config_type="resnet",
+            default_config_kwargs={"out_features": ["stage4"]},
+            **kwargs,
         )
         self.backbone_config = backbone_config
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs
         self.distance_loss_weight = distance_loss_weight
         self.duration_loss_weight = duration_loss_weight
         self.visual_prompter_type = visual_prompter_type
@@ -172,6 +146,7 @@ class TvpConfig(PreTrainedConfig):
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.pad_token_id = pad_token_id
         super().__init__(**kwargs)

transformers/models/tvp/image_processing_tvp_fast.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from typing import Optional
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
@@ -83,13 +83,14 @@ class TvpImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `ImageInput`: The images with a valid nesting.
         """
+        images = self.fetch_images(images)
         return make_nested_list_of_images(images, **kwargs)
     def resize(
         self,
         image: "torch.Tensor",
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         antialias: bool = True,
         **kwargs,
     ) -> "torch.Tensor":
@@ -102,7 +103,7 @@ class TvpImageProcessorFast(BaseImageProcessorFast):
             size (`SizeDict` or `dict`):
                 Size dictionary. If `size` has `longest_edge`, resize the longest edge to that value
                 while maintaining aspect ratio. Otherwise, use the base class resize method.
-            interpolation (`F.InterpolationMode`, *optional*):
+            interpolation (`tvF.InterpolationMode`, *optional*):
                 Interpolation method to use.
             antialias (`bool`, *optional*, defaults to `True`):
                 Whether to use antialiasing.
@@ -110,7 +111,7 @@ class TvpImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `torch.Tensor`: The resized image.
         """
-        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BILINEAR
         # Handle longest_edge case (TVP-specific)
         if size.longest_edge:
@@ -157,7 +158,7 @@ class TvpImageProcessorFast(BaseImageProcessorFast):
         images: list[list["torch.Tensor"]],
         do_resize: bool,
         size: SizeDict | dict,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_center_crop: bool,
         crop_size: SizeDict | dict,
         do_rescale: bool,

transformers/models/tvp/modeling_tvp.py CHANGED Viewed

@@ -21,11 +21,11 @@ from torch import nn
 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import load_backbone
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring, logging
-from ...utils.backbone_utils import load_backbone
 from .configuration_tvp import TvpConfig

transformers/models/udop/configuration_udop.py CHANGED Viewed

@@ -79,6 +79,13 @@ class UdopConfig(PreTrainedConfig):
             The patch size used by the vision encoder.
         num_channels (`int`, *optional*, defaults to 3):
             The number of channels in the input images.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
+            decoder-only or encoder-only architectures.
+        add_cross_attention (`bool`, *optional*, defaults to `False`):
+            Whether cross-attention layers should be added to the model.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
     """
     model_type = "udop"
@@ -109,8 +116,13 @@ class UdopConfig(PreTrainedConfig):
         image_size=224,
         patch_size=16,
         num_channels=3,
+        is_decoder=False,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
         self.vocab_size = vocab_size
         self.d_model = d_model
         self.d_kv = d_kv
@@ -127,6 +139,8 @@ class UdopConfig(PreTrainedConfig):
         self.initializer_factor = initializer_factor
         self.feed_forward_proj = feed_forward_proj
         self.use_cache = use_cache
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = eos_token_id
         # UDOP attributes
         self.max_2d_position_embeddings = max_2d_position_embeddings
@@ -148,13 +162,8 @@ class UdopConfig(PreTrainedConfig):
                 "'gated-gelu' or 'relu'"
             )
-        kwargs["tie_word_embeddings"] = True
-        super().__init__(
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            **kwargs,
-        )
+        self.tie_word_embeddings = True
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 __all__ = ["UdopConfig"]

transformers/models/udop/modeling_udop.py CHANGED Viewed

@@ -46,6 +46,7 @@ from ...utils import (
     is_torch_flex_attn_available,
     is_torchdynamo_compiling,
 )
+from ...utils.generic import is_flash_attention_requested
 if is_torch_flex_attn_available():
@@ -195,8 +196,11 @@ def combine_image_text_embeddings(
         visual_bbox = visual_bbox.to(image_embeddings.device)
     visual_bbox = [visual_bbox[i][patch_inds[i]] for i in range(len(patch_inds))]
     if attention_mask is not None:
-        visual_attention_mask = [torch.tensor([1] * len(item)).to(attention_mask) for item in visual_bbox]
+        visual_attention_mask = [
+            torch.ones(item.size(0), dtype=attention_mask.dtype, device=attention_mask.device) for item in visual_bbox
+        ]
     if max_len == 0:
         max_len = image_embeddings.size(1)
@@ -1297,7 +1301,7 @@ class UdopStack(UdopPreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool = False,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and (attention_mask == 0.0).any():
                 return attention_mask
             return None
@@ -1419,8 +1423,8 @@ class UdopModel(UdopPreTrainedModel):
     _tied_weights_keys = {
         "encoder.embed_tokens.weight": "shared.weight",
         "decoder.embed_tokens.weight": "shared.weight",
-        "encoder.embed_patches.proj.weight": "patch_embed.proj.weight",  # TODO tie weights for patch embeddings not working
-        "encoder.embed_patches.proj.bias": "patch_embed.proj.bias",  # TODO tie weights for patch embeddings not working
+        "encoder.embed_patches.proj.weight": "patch_embed.proj.weight",
+        "encoder.embed_patches.proj.bias": "patch_embed.proj.bias",
     }
     def __init__(self, config):
@@ -1465,7 +1469,7 @@ class UdopModel(UdopPreTrainedModel):
         encoder_outputs: Tensor | None = None,
         past_key_values: Cache | None = None,
         decoder_inputs_embeds: Tensor | None = None,
-        use_cache=True,
+        use_cache: bool | None = None,
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
@@ -1641,7 +1645,7 @@ class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin):
         encoder_outputs: Tensor | None = None,
         past_key_values: Cache | None = None,
         decoder_inputs_embeds: Tensor | None = None,
-        use_cache=True,
+        use_cache: bool | None = None,
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,

transformers/models/umt5/configuration_umt5.py CHANGED Viewed

@@ -97,8 +97,10 @@ class UMT5Config(PreTrainedConfig):
         eos_token_id=1,
         decoder_start_token_id=0,
         classifier_dropout=0.0,
+        is_decoder=False,
         **kwargs,
     ):
+        self.is_decoder = is_decoder
         self.vocab_size = vocab_size
         self.d_model = d_model
         self.d_kv = d_kv
@@ -131,14 +133,14 @@ class UMT5Config(PreTrainedConfig):
         if feed_forward_proj == "gated-gelu":
             self.dense_act_fn = "gelu_new"
-        # Force because official weights have False serialized, but we have to tie always
-        kwargs["tie_word_embeddings"] = True
+        self.tokenizer_class = tokenizer_class
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.tie_word_embeddings = True  # force it for T5 family
         super().__init__(
             is_encoder_decoder=is_encoder_decoder,
-            tokenizer_class=tokenizer_class,
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
             **kwargs,
         )

transformers/models/umt5/modeling_umt5.py CHANGED Viewed

@@ -44,7 +44,9 @@ from ...utils import (
     is_torch_flex_attn_available,
     is_torchdynamo_compiling,
     logging,
+    torch_compilable_check,
 )
+from ...utils.generic import is_flash_attention_requested
 from .configuration_umt5 import UMT5Config
@@ -778,7 +780,7 @@ class UMT5Stack(UMT5PreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool = False,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and (attention_mask == 0.0).any():
                 return attention_mask
             return None
@@ -1472,8 +1474,10 @@ class UMT5ForSequenceClassification(UMT5PreTrainedModel):
         eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)
-        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
-            raise ValueError("All examples must have the same number of <eos> tokens.")
+        torch_compilable_check(
+            torch.unique_consecutive(eos_mask.sum(1)).numel() == 1,
+            "All examples must have the same number of <eos> tokens.",
+        )
         batch_size, _, hidden_size = sequence_output.shape
         sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
         logits = self.classification_head(sentence_representation)

transformers/models/unispeech/configuration_unispeech.py CHANGED Viewed

@@ -233,7 +233,10 @@ class UniSpeechConfig(PreTrainedConfig):
         replace_prob=0.5,
         **kwargs,
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.feat_extract_norm = feat_extract_norm
         self.feat_extract_activation = feat_extract_activation

transformers/models/unispeech/modeling_unispeech.py CHANGED Viewed

@@ -41,7 +41,7 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
     Wav2Vec2BaseModelOutput,
 )
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel, get_torch_context_manager_or_global_device
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, logging
 from .configuration_unispeech import UniSpeechConfig
@@ -356,9 +356,9 @@ class UniSpeechAttention(nn.Module):
         key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
         value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -1218,6 +1218,9 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel):
         This method is **not** supposed to be called by the user and is prone to be changed in the future.
         """
+        if get_torch_context_manager_or_global_device() == torch.device("meta"):
+            return
         # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
         # correctly load adapter layers for UniSpeech so that we do not have to introduce a new API to
         # [`PreTrainedModel`]. While slightly hacky, UniSpeech never has to tie input and output embeddings, so that it is

transformers/models/unispeech_sat/configuration_unispeech_sat.py CHANGED Viewed

@@ -246,7 +246,10 @@ class UniSpeechSatConfig(PreTrainedConfig):
         num_clusters=504,
         **kwargs,
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.feat_extract_norm = feat_extract_norm
         self.feat_extract_activation = feat_extract_activation

transformers/models/unispeech_sat/modeling_unispeech_sat.py CHANGED Viewed

@@ -44,7 +44,7 @@ from ...modeling_outputs import (
     Wav2Vec2BaseModelOutput,
     XVectorOutput,
 )
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel, get_torch_context_manager_or_global_device
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, is_peft_available, logging
 from .configuration_unispeech_sat import UniSpeechSatConfig
@@ -362,9 +362,9 @@ class UniSpeechSatAttention(nn.Module):
         key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
         value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -1214,6 +1214,9 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
         This method is **not** supposed to be called by the user and is prone to be changed in the future.
         """
+        if get_torch_context_manager_or_global_device() == torch.device("meta"):
+            return
         # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
         # correctly load adapter layers for UniSpeechSat so that we do not have to introduce a new API to
         # [`PreTrainedModel`]. While slightly hacky, UniSpeechSat never has to tie input and output embeddings, so that it is

transformers/models/upernet/configuration_upernet.py CHANGED Viewed

@@ -13,10 +13,10 @@
 # limitations under the License.
 """UperNet model configuration"""
+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import verify_backbone_config_arguments
-from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig
+from ..auto.configuration_auto import AutoConfig
 logger = logging.get_logger(__name__)
@@ -35,18 +35,6 @@ class UperNetConfig(PreTrainedConfig):
     Args:
         backbone_config (`PreTrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, `False`):
-            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
-            library.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         hidden_size (`int`, *optional*, defaults to 512):
             The number of hidden units in the convolutional layers.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -87,10 +75,6 @@ class UperNetConfig(PreTrainedConfig):
     def __init__(
         self,
         backbone_config=None,
-        backbone=None,
-        use_pretrained_backbone=False,
-        use_timm_backbone=False,
-        backbone_kwargs=None,
         hidden_size=512,
         initializer_range=0.02,
         pool_scales=[1, 2, 3, 6],
@@ -103,27 +87,16 @@ class UperNetConfig(PreTrainedConfig):
         loss_ignore_index=255,
         **kwargs,
     ):
-        if backbone_config is None and backbone is None:
-            logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
-            backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage1", "stage2", "stage3", "stage4"])
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.get("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
+            default_config_type="resnet",
+            default_config_kwargs={
+                "out_features": ["stage1", "stage2", "stage3", "stage4"],
+            },
+            **kwargs,
         )
         self.backbone_config = backbone_config
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs
         self.hidden_size = hidden_size
         self.initializer_range = initializer_range
         self.pool_scales = pool_scales

transformers/models/upernet/modeling_upernet.py CHANGED Viewed

@@ -17,10 +17,10 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
+from ...backbone_utils import load_backbone
 from ...modeling_outputs import SemanticSegmenterOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring
-from ...utils.backbone_utils import load_backbone
 from .configuration_upernet import UperNetConfig

transformers/models/vaultgemma/configuration_vaultgemma.py CHANGED Viewed

@@ -148,6 +148,10 @@ class VaultGemmaConfig(PreTrainedConfig):
         attn_logit_softcapping: float | None = 50.0,
         **kwargs,
     ):
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -176,13 +180,7 @@ class VaultGemmaConfig(PreTrainedConfig):
         self.rope_parameters = rope_parameters
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 __all__ = ["VaultGemmaConfig"]

transformers/models/vaultgemma/modeling_vaultgemma.py CHANGED Viewed

@@ -212,9 +212,9 @@ class VaultGemmaAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -467,7 +467,7 @@ class VaultGemmaModel(VaultGemmaPreTrainedModel):
 @auto_docstring
 class VaultGemmaForCausalLM(VaultGemmaPreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl