PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/table_transformer/modeling_table_transformer.py CHANGED Viewed

@@ -21,6 +21,7 @@ from torch import Tensor, nn
 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import load_backbone
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
@@ -28,18 +29,11 @@ from ...modeling_utils import PreTrainedModel
 from ...utils import (
     ModelOutput,
     auto_docstring,
-    is_timm_available,
     logging,
-    requires_backends,
 )
-from ...utils.backbone_utils import load_backbone
 from .configuration_table_transformer import TableTransformerConfig
-if is_timm_available():
-    from timm import create_model
 logger = logging.get_logger(__name__)
@@ -196,7 +190,7 @@ def replace_batch_norm(model):
             replace_batch_norm(module)
-# Copied from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr->TableTransformer
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr->TableTransformer
 class TableTransformerConvEncoder(nn.Module):
     """
     Convolutional backbone, using either the AutoBackbone API or one from the timm library.
@@ -210,47 +204,25 @@ class TableTransformerConvEncoder(nn.Module):
         self.config = config
-        # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API
-        if config.use_timm_backbone:
-            # We default to values which were previously hard-coded. This enables configurability from the config
-            # using backbone arguments, while keeping the default behavior the same.
-            requires_backends(self, ["timm"])
-            kwargs = getattr(config, "backbone_kwargs", {})
-            kwargs = {} if kwargs is None else kwargs.copy()
-            out_indices = kwargs.pop("out_indices", (1, 2, 3, 4))
-            num_channels = kwargs.pop("in_chans", config.num_channels)
-            if config.dilation:
-                kwargs["output_stride"] = kwargs.get("output_stride", 16)
-            backbone = create_model(
-                config.backbone,
-                pretrained=config.use_pretrained_backbone,
-                features_only=True,
-                out_indices=out_indices,
-                in_chans=num_channels,
-                **kwargs,
-            )
-        else:
-            backbone = load_backbone(config)
+        backbone = load_backbone(config)
+        self.intermediate_channel_sizes = backbone.channels
         # replace batch norm by frozen batch norm
         with torch.no_grad():
             replace_batch_norm(backbone)
-        self.model = backbone
-        self.intermediate_channel_sizes = (
-            self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
-        )
-        backbone_model_type = None
-        if config.backbone is not None:
-            backbone_model_type = config.backbone
-        elif config.backbone_config is not None:
-            backbone_model_type = config.backbone_config.model_type
-        else:
-            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+        # We used to load with timm library directly instead of the AutoBackbone API
+        # so we need to unwrap the `backbone._backbone` module to load weights without mismatch
+        is_timm_model = False
+        if hasattr(backbone, "_backbone"):
+            backbone = backbone._backbone
+            is_timm_model = True
+        self.model = backbone
+        backbone_model_type = config.backbone_config.model_type
         if "resnet" in backbone_model_type:
             for name, parameter in self.model.named_parameters():
-                if config.use_timm_backbone:
+                if is_timm_model:
                     if "layer2" not in name and "layer3" not in name and "layer4" not in name:
                         parameter.requires_grad_(False)
                 else:
@@ -259,7 +231,9 @@ class TableTransformerConvEncoder(nn.Module):
     def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
         # send pixel_values through the model to get list of feature maps
-        features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
+        features = self.model(pixel_values)
+        if isinstance(features, dict):
+            features = features.feature_maps
         out = []
         for feature_map in features:
@@ -269,7 +243,7 @@ class TableTransformerConvEncoder(nn.Module):
         return out
-# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->TableTransformer
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->TableTransformer
 class TableTransformerConvModel(nn.Module):
     """
     This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
@@ -291,7 +265,7 @@ class TableTransformerConvModel(nn.Module):
         return out, pos
-# Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->TableTransformer
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrSinePositionEmbedding with Detr->TableTransformer
 class TableTransformerSinePositionEmbedding(nn.Module):
     """
     This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
@@ -329,7 +303,7 @@ class TableTransformerSinePositionEmbedding(nn.Module):
         return pos
-# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->TableTransformer
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding with Detr->TableTransformer
 class TableTransformerLearnedPositionEmbedding(nn.Module):
     """
     This module learns positional embeddings up to a fixed maximum size.
@@ -353,7 +327,7 @@ class TableTransformerLearnedPositionEmbedding(nn.Module):
         return pos
-# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->TableTransformer
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->TableTransformer
 def build_position_encoding(config):
     n_steps = config.d_model // 2
     if config.position_embedding_type == "sine":
@@ -367,7 +341,7 @@ def build_position_encoding(config):
     return position_embedding
-# Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->TABLE_TRANSFORMER,Detr->TableTransformer
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrAttention with DETR->TABLE_TRANSFORMER,Detr->TableTransformer
 class TableTransformerAttention(nn.Module):
     """
     Multi-headed attention from 'Attention Is All You Need' paper.
@@ -502,7 +476,7 @@ class TableTransformerAttention(nn.Module):
 class TableTransformerEncoderLayer(nn.Module):
-    # Copied from transformers.models.detr.modeling_detr.DetrEncoderLayer.__init__ with Detr->TableTransformer
+    # TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrEncoderLayer.__init__ with Detr->TableTransformer
     def __init__(self, config: TableTransformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
@@ -575,7 +549,7 @@ class TableTransformerEncoderLayer(nn.Module):
 class TableTransformerDecoderLayer(GradientCheckpointingLayer):
-    # Copied from transformers.models.detr.modeling_detr.DetrDecoderLayer.__init__ with Detr->TableTransformer
+    # TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrDecoderLayer.__init__ with Detr->TableTransformer
     def __init__(self, config: TableTransformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
@@ -828,7 +802,7 @@ class TableTransformerEncoder(TableTransformerPreTrainedModel):
         )
-# Copied from transformers.models.detr.modeling_detr.DetrDecoder with DETR->TABLE_TRANSFORMER,Detr->TableTransformer
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrDecoder with DETR->TABLE_TRANSFORMER,Detr->TableTransformer
 class TableTransformerDecoder(TableTransformerPreTrainedModel):
     """
     Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TableTransformerDecoderLayer`].
@@ -1003,7 +977,7 @@ class TableTransformerDecoder(TableTransformerPreTrainedModel):
     """
 )
 class TableTransformerModel(TableTransformerPreTrainedModel):
-    # Copied from transformers.models.detr.modeling_detr.DetrModel.__init__ with Detr->TableTransformer
+    # TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrModel.__init__ with Detr->TableTransformer
     def __init__(self, config: TableTransformerConfig):
         super().__init__(config)
@@ -1172,7 +1146,7 @@ class TableTransformerModel(TableTransformerPreTrainedModel):
     """
 )
 class TableTransformerForObjectDetection(TableTransformerPreTrainedModel):
-    # Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection.__init__ with Detr->TableTransformer
+    # TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrForObjectDetection.__init__ with Detr->TableTransformer
     def __init__(self, config: TableTransformerConfig):
         super().__init__(config)
@@ -1306,7 +1280,7 @@ class TableTransformerForObjectDetection(TableTransformerPreTrainedModel):
         )
-# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->TableTransformer,detr->table_transformer
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with Detr->TableTransformer,detr->table_transformer
 class TableTransformerMLPPredictionHead(nn.Module):
     """
     Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,

transformers/models/tapas/configuration_tapas.py CHANGED Viewed

@@ -151,6 +151,8 @@ class TapasConfig(PreTrainedConfig):
         initializer_range=0.02,
         layer_norm_eps=1e-12,
         pad_token_id=0,
+        bos_token_id=None,
+        eos_token_id=None,
         positive_label_weight=10.0,
         num_aggregation_labels=0,
         aggregation_loss_weight=1.0,
@@ -175,11 +177,20 @@ class TapasConfig(PreTrainedConfig):
         disable_per_token_loss=False,
         aggregation_labels=None,
         no_aggregation_label_index=None,
+        is_decoder=False,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        super().__init__(**kwargs)
         # BERT hyperparameters (with updated max_position_embeddings and type_vocab_sizes)
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers

transformers/models/tapas/modeling_tapas.py CHANGED Viewed

@@ -1491,7 +1491,7 @@ def _segment_reduce(values, index, segment_reduce_fn, name):
     new_shape = torch.cat(
         [
             torch.as_tensor(index.batch_shape(), dtype=torch.long, device=device),
-            torch.as_tensor([index.num_segments], dtype=torch.long, device=device),
+            torch.as_tensor(index.num_segments, dtype=torch.long, device=device).unsqueeze(dim=0),
             torch.as_tensor(vector_shape, dtype=torch.long, device=device),
         ],
         dim=0,

transformers/models/tapas/tokenization_tapas.py CHANGED Viewed

@@ -229,6 +229,7 @@ class TapasTokenizer(PreTrainedTokenizer):
             extra spaces.
     """
+    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
     vocab_files_names = VOCAB_FILES_NAMES
     def __init__(

transformers/models/textnet/configuration_textnet.py CHANGED Viewed

@@ -13,9 +13,9 @@
 # limitations under the License.
 """TextNet model configuration"""
-from transformers import PreTrainedConfig
-from transformers.utils import logging
-from transformers.utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+from ...backbone_utils import BackboneConfigMixin
+from ...configuration_utils import PreTrainedConfig
+from ...utils import logging
 logger = logging.get_logger(__name__)
@@ -126,9 +126,7 @@ class TextNetConfig(BackboneConfigMixin, PreTrainedConfig):
         self.depths = [len(layer) for layer in self.conv_layer_kernel_sizes]
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, 5)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)
 __all__ = ["TextNetConfig"]

transformers/models/textnet/image_processing_textnet_fast.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from typing import Optional
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast
@@ -68,7 +68,7 @@ class TextNetImageProcessorFast(BaseImageProcessorFast):
         self,
         image: "torch.Tensor",
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         antialias: bool = True,
         size_divisor: int = 32,
         **kwargs,
@@ -99,7 +99,7 @@ class TextNetImageProcessorFast(BaseImageProcessorFast):
         do_resize: bool,
         size: SizeDict,
         size_divisor: int,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,

transformers/models/textnet/modeling_textnet.py CHANGED Viewed

@@ -19,19 +19,17 @@ import torch
 import torch.nn as nn
 from torch import Tensor
-from transformers import PreTrainedModel
-from transformers.activations import ACT2CLS
-from transformers.modeling_outputs import (
+from ...activations import ACT2CLS
+from ...backbone_utils import BackboneMixin
+from ...modeling_outputs import (
     BackboneOutput,
     BaseModelOutputWithNoAttention,
     BaseModelOutputWithPoolingAndNoAttention,
     ImageClassifierOutputWithNoAttention,
 )
-from transformers.models.textnet.configuration_textnet import TextNetConfig
-from transformers.utils import logging
-from transformers.utils.backbone_utils import BackboneMixin
-from ...utils import auto_docstring
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from .configuration_textnet import TextNetConfig
 logger = logging.get_logger(__name__)
@@ -302,12 +300,14 @@ class TextNetForImageClassification(TextNetPreTrainedModel):
         Examples:
         ```python
         >>> import torch
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import TextNetForImageClassification, TextNetImageProcessor
         >>> from PIL import Image
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> processor = TextNetImageProcessor.from_pretrained("czczup/textnet-base")
         >>> model = TextNetForImageClassification.from_pretrained("czczup/textnet-base")
@@ -342,12 +342,11 @@ class TextNetForImageClassification(TextNetPreTrainedModel):
     TextNet backbone, to be used with frameworks like DETR and MaskFormer.
     """
 )
-class TextNetBackbone(TextNetPreTrainedModel, BackboneMixin):
+class TextNetBackbone(BackboneMixin, TextNetPreTrainedModel):
     has_attentions = False
     def __init__(self, config):
         super().__init__(config)
-        super()._init_backbone(config)
         self.textnet = TextNetModel(config)
         self.num_features = config.hidden_sizes
@@ -368,12 +367,14 @@ class TextNetBackbone(TextNetPreTrainedModel, BackboneMixin):
         ```python
         >>> import torch
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from PIL import Image
         >>> from transformers import AutoImageProcessor, AutoBackbone
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> processor = AutoImageProcessor.from_pretrained("czczup/textnet-base")
         >>> model = AutoBackbone.from_pretrained("czczup/textnet-base")

transformers/models/time_series_transformer/modeling_time_series_transformer.py CHANGED Viewed

@@ -402,9 +402,9 @@ class TimeSeriesTransformerAttention(nn.Module):
                 if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                     past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/timesfm/modeling_timesfm.py CHANGED Viewed

@@ -245,9 +245,9 @@ class TimesFmAttention(nn.Module):
         key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        attention_interface: Callable = simple_eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, simple_eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -620,7 +620,7 @@ class TimesFmModelForPrediction(TimesFmPreTrainedModel):
         - the number of padded examples for SPMD so that each core has the same
             number (a multiple of `batch_size`) of examples.
         """
-        input_ts, input_padding, inp_freq = [], [], []
+        input_ts, input_padding = [], []
         for i, ts in enumerate(inputs):
             input_len = ts.shape[0]
@@ -635,12 +635,11 @@ class TimesFmModelForPrediction(TimesFmPreTrainedModel):
             input_ts.append(ts)
             input_padding.append(padding)
-            inp_freq.append(freq[i])
         return (
             torch.stack(input_ts, dim=0),
             torch.stack(input_padding, dim=0),
-            torch.tensor(inp_freq, dtype=torch.int32).reshape(-1, 1),
+            torch.tensor(freq[: len(inputs)], dtype=torch.int32).reshape(-1, 1),
         )
     def _postprocess_output(

transformers/models/timesfm/modular_timesfm.py CHANGED Viewed

@@ -201,9 +201,9 @@ class TimesFmAttention(nn.Module):
         key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        attention_interface: Callable = simple_eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, simple_eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -576,7 +576,7 @@ class TimesFmModelForPrediction(TimesFmPreTrainedModel):
         - the number of padded examples for SPMD so that each core has the same
             number (a multiple of `batch_size`) of examples.
         """
-        input_ts, input_padding, inp_freq = [], [], []
+        input_ts, input_padding = [], []
         for i, ts in enumerate(inputs):
             input_len = ts.shape[0]
@@ -591,12 +591,11 @@ class TimesFmModelForPrediction(TimesFmPreTrainedModel):
             input_ts.append(ts)
             input_padding.append(padding)
-            inp_freq.append(freq[i])
         return (
             torch.stack(input_ts, dim=0),
             torch.stack(input_padding, dim=0),
-            torch.tensor(inp_freq, dtype=torch.int32).reshape(-1, 1),
+            torch.tensor(freq[: len(inputs)], dtype=torch.int32).reshape(-1, 1),
         )
     def _postprocess_output(

transformers/models/timm_backbone/configuration_timm_backbone.py CHANGED Viewed

@@ -14,6 +14,7 @@
 """Configuration for Backbone models"""
+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
@@ -21,7 +22,7 @@ from ...utils import logging
 logger = logging.get_logger(__name__)
-class TimmBackboneConfig(PreTrainedConfig):
+class TimmBackboneConfig(BackboneConfigMixin, PreTrainedConfig):
     r"""
     This is the configuration class to store the configuration for a timm backbone [`TimmBackbone`].
@@ -37,8 +38,6 @@ class TimmBackboneConfig(PreTrainedConfig):
             The number of input channels.
         features_only (`bool`, *optional*, defaults to `True`):
             Whether to output only the features or also the logits.
-        use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
-            Whether to use a pretrained backbone.
         out_indices (`list[int]`, *optional*):
             If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
             many stages the model has). Will default to the last stage if unset.
@@ -67,19 +66,46 @@ class TimmBackboneConfig(PreTrainedConfig):
         backbone=None,
         num_channels=3,
         features_only=True,
-        use_pretrained_backbone=True,
         out_indices=None,
         freeze_batch_norm_2d=False,
+        output_stride=None,
         **kwargs,
     ):
-        super().__init__(**kwargs)
         self.backbone = backbone
         self.num_channels = num_channels
         self.features_only = features_only
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = True
         self.out_indices = out_indices if out_indices is not None else [-1]
+        self.output_stride = output_stride
         self.freeze_batch_norm_2d = freeze_batch_norm_2d
+        # self._out_features = kwargs.pop("out_features", None)
+        super().__init__(**kwargs)
+    @property
+    def out_indices(self):
+        return self._out_indices
+    @out_indices.setter
+    def out_indices(self, out_indices: tuple[int, ...] | list[int]):
+        """
+        Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices.
+        """
+        self._out_indices = list(out_indices) if out_indices is not None else out_indices
+        if getattr(self, "stage_names", None) is not None:
+            self.set_output_features_output_indices(out_features=None, out_indices=out_indices)
+    @property
+    def out_features(self):
+        return self._out_features
+    @out_features.setter
+    def out_features(self, out_features: list[str]):
+        """
+        Set the out_features attribute. This will also update the out_indices attribute to match the new out_features.
+        """
+        self._out_features = out_features
+        if getattr(self, "stage_names", None) is not None:
+            self.set_output_features_output_indices(out_features=out_features, out_indices=None)
 __all__ = ["TimmBackboneConfig"]

transformers/models/timm_backbone/modeling_timm_backbone.py CHANGED Viewed

@@ -17,10 +17,10 @@ import torch
 from torch import Tensor, nn
 from ... import initialization as init
+from ...backbone_utils import BackboneMixin
 from ...modeling_outputs import BackboneOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import is_timm_available, requires_backends
-from ...utils.backbone_utils import BackboneMixin
 from .configuration_timm_backbone import TimmBackboneConfig
@@ -28,7 +28,7 @@ if is_timm_available():
     import timm
-class TimmBackbone(PreTrainedModel, BackboneMixin):
+class TimmBackbone(BackboneMixin, PreTrainedModel):
     """
     Wrapper class for timm models to be used as backbones. This enables using the timm models interchangeably with the
     other models in the library keeping the same API.
@@ -41,8 +41,6 @@ class TimmBackbone(PreTrainedModel, BackboneMixin):
     def __init__(self, config, **kwargs):
         requires_backends(self, "timm")
-        super().__init__(config)
-        self.config = config
         if config.backbone is None:
             raise ValueError("backbone is not set in the config. Please set it to a timm model name.")
@@ -50,25 +48,29 @@ class TimmBackbone(PreTrainedModel, BackboneMixin):
         if hasattr(config, "out_features") and config.out_features is not None:
             raise ValueError("out_features is not supported by TimmBackbone. Please use out_indices instead.")
-        pretrained = getattr(config, "use_pretrained_backbone", None)
-        if pretrained is None:
-            raise ValueError("use_pretrained_backbone is not set in the config. Please set it to True or False.")
         # We just take the final layer by default. This matches the default for the transformers models.
         out_indices = config.out_indices if getattr(config, "out_indices", None) is not None else (-1,)
+        pretrained = kwargs.pop("pretrained", False)
         in_chans = kwargs.pop("in_chans", config.num_channels)
-        self._backbone = timm.create_model(
+        backbone = timm.create_model(
             config.backbone,
             pretrained=pretrained,
             # This is currently not possible for transformer architectures.
             features_only=config.features_only,
             in_chans=in_chans,
             out_indices=out_indices,
+            output_stride=config.output_stride,
             **kwargs,
         )
-        # Converts all `BatchNorm2d` and `SyncBatchNorm` or `BatchNormAct2d` and `SyncBatchNormAct2d` layers of provided module into `FrozenBatchNorm2d` or `FrozenBatchNormAct2d` respectively
+        # Needs to be called after creating timm model, because `super()` will try to infer
+        # `stage_names` from model architecture
+        super().__init__(config, timm_backbone=backbone)
+        self._backbone = backbone
+        # Converts all `BatchNorm2d` and `SyncBatchNorm` or `BatchNormAct2d` and `SyncBatchNormAct2d` layers of
+        # provided module into `FrozenBatchNorm2d` or `FrozenBatchNormAct2d` respectively
         if getattr(config, "freeze_batch_norm_2d", False):
             self.freeze_batch_norm_2d()
@@ -78,7 +80,6 @@ class TimmBackbone(PreTrainedModel, BackboneMixin):
             layer["module"]: str(layer["index"]) for layer in self._backbone.feature_info.get_dicts()
         }
         self._all_layers = {layer["module"]: str(i) for i, layer in enumerate(self._backbone.feature_info.info)}
-        super()._init_backbone(config)
         self.post_init()
@@ -87,23 +88,16 @@ class TimmBackbone(PreTrainedModel, BackboneMixin):
         requires_backends(cls, ["vision", "timm"])
         config = kwargs.pop("config", TimmBackboneConfig())
-        use_timm = kwargs.pop("use_timm_backbone", True)
-        if not use_timm:
-            raise ValueError("use_timm_backbone must be True for timm backbones")
         num_channels = kwargs.pop("num_channels", config.num_channels)
         features_only = kwargs.pop("features_only", config.features_only)
-        use_pretrained_backbone = kwargs.pop("use_pretrained_backbone", config.use_pretrained_backbone)
         out_indices = kwargs.pop("out_indices", config.out_indices)
         config = TimmBackboneConfig(
             backbone=pretrained_model_name_or_path,
             num_channels=num_channels,
             features_only=features_only,
-            use_pretrained_backbone=use_pretrained_backbone,
             out_indices=out_indices,
         )
-        return super()._from_config(config, **kwargs)
+        return super()._from_config(config, pretrained=True, **kwargs)
     def freeze_batch_norm_2d(self):
         timm.utils.model.freeze_batch_norm_2d(self._backbone)
@@ -117,10 +111,13 @@ class TimmBackbone(PreTrainedModel, BackboneMixin):
         assume weights and persistent buffers will be part of checkpoint as we have no way to control timm inits)"""
         if hasattr(module, "init_non_persistent_buffers"):
             module.init_non_persistent_buffers()
-        elif isinstance(module, nn.BatchNorm2d) and getattr(module, "running_mean", None) is not None:
-            init.zeros_(module.running_mean)
-            init.ones_(module.running_var)
-            init.zeros_(module.num_batches_tracked)
+        elif isinstance(module, nn.BatchNorm2d):
+            # For non-pretrained models, always initialize buffers (handles both meta device and to_empty() cases)
+            running_mean = getattr(module, "running_mean", None)
+            if running_mean is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
     def forward(
         self,

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl