transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -128,6 +128,8 @@ class Sam3TrackerPreTrainedModel(PreTrainedModel):
         if isinstance(module, Sam3TrackerModel):
             if module.no_memory_embedding is not None:
                 init.zeros_(module.no_memory_embedding)
+        elif isinstance(module, Sam3TrackerPositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)


 class Sam3TrackerPositionalEmbedding(nn.Module):
@@ -149,6 +149,8 @@ class Sam3TrackerPreTrainedModel(Sam2PreTrainedModel):
         if isinstance(module, Sam3TrackerModel):
             if module.no_memory_embedding is not None:
                 init.zeros_(module.no_memory_embedding)
+        elif isinstance(module, Sam3TrackerPositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)


 class Sam3TrackerPositionalEmbedding(Sam2PositionalEmbedding):
@@ -397,5 +397,30 @@ class Sam3TrackerVideoConfig(PreTrainedConfig):

         super().__init__(**kwargs)

+    @property
+    def image_size(self):
+        """Image size for the tracker video model."""
+        return self.vision_config.image_size
+
+    @image_size.setter
+    def image_size(self, value):
+        """Set the image size and propagate to sub-configs. Calculates feature sizes based on patch_size."""
+        self.prompt_encoder_config.image_size = value
+        self.vision_config.image_size = value
+
+        patch_size = self.vision_config.backbone_config.patch_size
+        self.vision_config.backbone_feature_sizes = [
+            [4 * value // patch_size, 4 * value // patch_size],
+            [2 * value // patch_size, 2 * value // patch_size],
+            [value // patch_size, value // patch_size],
+        ]
+        self.memory_attention_rope_feat_sizes = [
+            value // patch_size,
+            value // patch_size,
+        ]
+
+        # keep the image_size in the __dict__ to save the value in the config file (backward compatibility)
+        self.__dict__["image_size"] = value
+

 __all__ = ["Sam3TrackerVideoMaskDecoderConfig", "Sam3TrackerVideoPromptEncoderConfig", "Sam3TrackerVideoConfig"]
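
The hunk above turns `image_size` into a write-through property on `Sam3TrackerVideoConfig`. A minimal usage sketch, assuming the class is importable from `transformers` in 5.0.0rc2 and using an illustrative resolution of 1024:

```python
from transformers import Sam3TrackerVideoConfig

config = Sam3TrackerVideoConfig()
# Assigning image_size now propagates to the prompt-encoder and vision sub-configs
# and recomputes the backbone feature-map sizes from the backbone patch size.
config.image_size = 1024

patch_size = config.vision_config.backbone_config.patch_size
assert config.vision_config.image_size == 1024
assert config.memory_attention_rope_feat_sizes == [1024 // patch_size, 1024 // patch_size]
```

The `self.__dict__["image_size"] = value` line keeps a plain attribute alongside the property so the value is still written out when the config is serialized (the backward-compatibility note in the diff).
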
@@ -213,7 +213,7 @@ class Sam3TrackerVideoInferenceSession:
         device_inputs = {}
         for key, value in inputs.items():
             if isinstance(value, torch.Tensor):
-                device_inputs[key] = value.to(self.inference_device, non_blocking=True)
+                device_inputs[key] = value.to(self.inference_device, non_blocking=False)
             else:
                 device_inputs[key] = value
         self.point_inputs_per_obj[obj_idx][frame_idx] = device_inputs
@@ -692,6 +692,12 @@ class Sam3TrackerVideoPreTrainedModel(PreTrainedModel):
         if isinstance(module, Sam3TrackerVideoMemoryFuserCXBlock):
             if module.scale is not None:
                 init.zeros_(module.scale)
+        elif isinstance(module, Sam3TrackerVideoVisionRotaryEmbedding):
+            inv_freq = module.create_inv_freq()
+            init.copy_(module.rope_embeddings_cos, inv_freq.cos())
+            init.copy_(module.rope_embeddings_sin, inv_freq.sin())
+        elif isinstance(module, Sam3TrackerVideoPositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)


 class Sam3TrackerVideoVisionRotaryEmbedding(nn.Module):
@@ -702,24 +708,17 @@ class Sam3TrackerVideoVisionRotaryEmbedding(nn.Module):

     def __init__(self, config: Sam3TrackerVideoConfig):
         super().__init__()
-        dim = config.memory_attention_hidden_size // (
+        self.dim = config.memory_attention_hidden_size // (
             config.memory_attention_downsample_rate * config.memory_attention_num_attention_heads
         )
         # Ensure even dimension for proper axial splitting
-        if dim % 4 != 0:
+        if self.dim % 4 != 0:
             raise ValueError("Dimension must be divisible by 4 for axial RoPE")
-        end_x, end_y = config.memory_attention_rope_feat_sizes
-        freqs = 1.0 / (config.memory_attention_rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+        self.end_x, self.end_y = config.memory_attention_rope_feat_sizes
+        self.memory_attention_rope_theta = config.memory_attention_rope_theta

-        # Generate 2D position indices for axial rotary embedding
-        flattened_indices = torch.arange(end_x * end_y, dtype=torch.long)
-        x_positions = flattened_indices % end_x
-        y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor")
-        freqs_x = torch.outer(x_positions, freqs).float()
-        freqs_y = torch.outer(y_positions, freqs).float()
-        inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
-        inv_freq = inv_freq.repeat_interleave(2, dim=-1)
         # directly register the cos and sin embeddings as we have a fixed feature shape
+        inv_freq = self.create_inv_freq()
         self.register_buffer("rope_embeddings_cos", inv_freq.cos(), persistent=False)
         self.register_buffer("rope_embeddings_sin", inv_freq.sin(), persistent=False)

@@ -728,6 +727,20 @@ class Sam3TrackerVideoVisionRotaryEmbedding(nn.Module):
         # As the feature map size is fixed, we can just return the pre-computed embeddings.
         return self.rope_embeddings_cos, self.rope_embeddings_sin

+    def create_inv_freq(self):
+        freqs = 1.0 / (
+            self.memory_attention_rope_theta ** (torch.arange(0, self.dim, 4)[: (self.dim // 4)].float() / self.dim)
+        )
+        # Generate 2D position indices for axial rotary embedding
+        flattened_indices = torch.arange(self.end_x * self.end_y, dtype=torch.long)
+        x_positions = flattened_indices % self.end_x
+        y_positions = torch.div(flattened_indices, self.end_x, rounding_mode="floor")
+        freqs_x = torch.outer(x_positions, freqs).float()
+        freqs_y = torch.outer(y_positions, freqs).float()
+        inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
+        inv_freq = inv_freq.repeat_interleave(2, dim=-1)
+        return inv_freq
+

 def rotate_pairwise(x):
     """
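
For reference, a self-contained sketch of the axial 2-D RoPE table that the new `create_inv_freq` method builds, with illustrative sizes (`dim=64`, an 8×8 feature grid, `theta=10000.0`); in the model these values come from the `memory_attention_*` config fields:

```python
import torch


def axial_rope_inv_freq(dim: int, end_x: int, end_y: int, theta: float = 10000.0) -> torch.Tensor:
    # dim // 4 base frequencies, shared between the x and y axes
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: dim // 4].float() / dim))
    # 2-D position indices for every cell of the (end_x, end_y) feature grid
    flattened_indices = torch.arange(end_x * end_y, dtype=torch.long)
    x_positions = flattened_indices % end_x
    y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor")
    freqs_x = torch.outer(x_positions, freqs).float()
    freqs_y = torch.outer(y_positions, freqs).float()
    inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
    return inv_freq.repeat_interleave(2, dim=-1)


table = axial_rope_inv_freq(dim=64, end_x=8, end_y=8)
print(table.shape)  # torch.Size([64, 64]): one row per 2-D position, one column per rotary channel
```

Factoring the computation into a method is what lets the new `_init_weights` branch (the `@@ -692,6 +692,12 @@` hunk above) rebuild the non-persistent `rope_embeddings_cos`/`rope_embeddings_sin` buffers on (re-)initialization.
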
@@ -1567,8 +1580,6 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
     input_modalities = ("video", "text")
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(Sam3TrackerVideoTwoWayAttentionBlock, index=2)}
     _keys_to_ignore_on_load_unexpected = [r"^detector_model."]
-    _tied_weights_keys = {}
-    _keys_to_ignore_on_load_missing = []
     _checkpoint_conversion_mapping = {
         r"tracker_model.(.+)": r"\1",  # the regex allows to remove the prefix, and add it back in revert mode
         "detector_model.vision_encoder.backbone.": "vision_encoder.backbone.",
@@ -353,6 +353,31 @@ class Sam3TrackerVideoConfig(PreTrainedConfig):

         super().__init__(**kwargs)

+    @property
+    def image_size(self):
+        """Image size for the tracker video model."""
+        return self.vision_config.image_size
+
+    @image_size.setter
+    def image_size(self, value):
+        """Set the image size and propagate to sub-configs. Calculates feature sizes based on patch_size."""
+        self.prompt_encoder_config.image_size = value
+        self.vision_config.image_size = value
+
+        patch_size = self.vision_config.backbone_config.patch_size
+        self.vision_config.backbone_feature_sizes = [
+            [4 * value // patch_size, 4 * value // patch_size],
+            [2 * value // patch_size, 2 * value // patch_size],
+            [value // patch_size, value // patch_size],
+        ]
+        self.memory_attention_rope_feat_sizes = [
+            value // patch_size,
+            value // patch_size,
+        ]
+
+        # keep the image_size in the __dict__ to save the value in the config file (backward compatibility)
+        self.__dict__["image_size"] = value
+

 class Sam3TrackerVideoInferenceCache(Sam2VideoInferenceCache):
     pass
@@ -461,8 +486,6 @@ class Sam3TrackerVideoModel(Sam2VideoModel):
         "tracker_neck.": "vision_encoder.neck.",
     }
     _keys_to_ignore_on_load_unexpected = [r"^detector_model."]
-    _tied_weights_keys = {}
-    _keys_to_ignore_on_load_missing = []

     def __init__(self, config: Sam3TrackerVideoConfig, remove_vision_encoder: bool = False):
         r"""
@@ -96,6 +96,9 @@ class Sam3VideoConfig(PreTrainedConfig):
     >>> # Initializing a SAM3 Video configuration with default detector and tracker
     >>> configuration = Sam3VideoConfig()

+    >>> # Changing image size for custom resolution inference (automatically propagates to all nested configs)
+    >>> configuration.image_size = 560
+
     >>> # Initializing a model from the configuration
     >>> model = Sam3VideoModel(configuration)

@@ -225,5 +228,16 @@ class Sam3VideoConfig(PreTrainedConfig):
         self.high_conf_thresh = high_conf_thresh
         self.high_iou_thresh = high_iou_thresh

+    @property
+    def image_size(self):
+        """Image size for the video model."""
+        return self.detector_config.image_size
+
+    @image_size.setter
+    def image_size(self, value):
+        """Recursively propagate the image size to detector and tracker configs."""
+        self.detector_config.image_size = value
+        self.tracker_config.image_size = value
+

 __all__ = ["Sam3VideoConfig"]
@@ -33,7 +33,7 @@ from .configuration_sam3_video import Sam3VideoConfig


 if is_kernels_available():
-    from
+    from ...integrations.hub_kernels import get_kernel

 logger = logging.get_logger(__name__)

@@ -505,8 +505,6 @@ class Sam3VideoPreTrainedModel(PreTrainedModel):

 @auto_docstring
 class Sam3VideoModel(Sam3VideoPreTrainedModel):
-    all_tied_weights_keys = {}
-
     def __init__(self, config: Sam3VideoConfig):
         super().__init__(config)
         self.config = config
@@ -542,6 +540,8 @@ class Sam3VideoModel(Sam3VideoPreTrainedModel):

         self.tracker_neck = Sam3VisionNeck(config.detector_config.vision_config)

+        self.post_init()
+
     def get_vision_features_for_tracker(self, vision_embeds: torch.Tensor):
         hidden_states = vision_embeds.last_hidden_state
         batch_size = hidden_states.shape[0]
@@ -340,7 +340,7 @@ class Sam3VideoProcessor(ProcessorMixin):

         # slice those valid entries from the original outputs
         keep_idx = torch.nonzero(keep, as_tuple=True)[0]
-        keep_idx_gpu = keep_idx.
+        keep_idx_gpu = keep_idx.to(device=out_binary_masks.device, non_blocking=True)

         out_obj_ids = torch.index_select(out_obj_ids, 0, keep_idx)
         out_probs = torch.index_select(out_probs, 0, keep_idx)
@@ -188,6 +188,7 @@ class SamHQVisionConfig(PreTrainedConfig):
         self.global_attn_indexes = global_attn_indexes
         self.num_pos_feats = num_pos_feats
         self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim
+        self.scale = self.hidden_size // 2


 class SamHQMaskDecoderConfig(PreTrainedConfig):
@@ -413,6 +413,29 @@ class SamHQVisionLayer(GradientCheckpointingLayer):
         return hidden_states


+class SamHQPositionalEmbedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.scale = config.scale
+        self.register_buffer("positional_embedding", self.scale * torch.randn((2, config.num_pos_feats)))
+
+    def forward(self, input_coords, input_shape=None):
+        """Positionally encode points that are normalized to [0,1]."""
+        coordinates = input_coords.clone()
+
+        if input_shape is not None:
+            coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1]
+            coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0]
+
+        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+        coordinates = 2 * coordinates - 1
+        coordinates = coordinates.to(self.positional_embedding.dtype)
+        coordinates = coordinates @ self.positional_embedding
+        coordinates = 2 * np.pi * coordinates
+        # outputs d_1 x ... x d_n x channel shape
+        return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1)
+
+
 @auto_docstring
 class SamHQPreTrainedModel(PreTrainedModel):
     config: SamHQConfig
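
The relocated `SamHQPositionalEmbedding` is a random-Fourier-feature encoding of 2-D point coordinates. A self-contained sketch of the same computation outside the model, with illustrative sizes (`num_pos_feats=128`, `scale=128`, five points); in the model the projection matrix is a registered buffer that the new `_init_weights` branch (next hunk) redraws with `std=module.scale`:

```python
import torch

num_pos_feats = 128                  # illustrative; mirrors SamHQVisionConfig.num_pos_feats
scale = 128                          # illustrative; the config hunk above sets scale = hidden_size // 2
positional_matrix = scale * torch.randn((2, num_pos_feats))

points = torch.rand(1, 1, 5, 2)      # (batch, point_batch, nb_points, 2), already normalized to [0, 1]
coords = 2 * points - 1              # map [0, 1] -> [-1, 1]
coords = coords @ positional_matrix  # project onto random 2-D frequencies
coords = 2 * torch.pi * coords
embedding = torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
print(embedding.shape)               # torch.Size([1, 1, 5, 256]) == 2 * num_pos_feats channels
```

Moving the class above `SamHQPreTrainedModel` lets `_init_weights` reference it directly, and the `SamHQVisionConfig` hunk stores `scale` on the config instead of recomputing `hidden_size // 2` inside the module.
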
@@ -433,6 +456,8 @@ class SamHQPreTrainedModel(PreTrainedModel):
         elif isinstance(module, SamHQVisionEncoder):
             if self.config.use_abs_pos:
                 init.zeros_(module.pos_embed)
+        elif isinstance(module, SamHQPositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)


 class SamHQPatchEmbeddings(nn.Module):
@@ -525,6 +550,7 @@ class SamHQVisionEncoder(SamHQPreTrainedModel):
         self.neck = SamHQVisionNeck(config)

         self.gradient_checkpointing = False
+        self.post_init()

     def get_input_embeddings(self):
         return self.patch_embed
@@ -1069,29 +1095,6 @@ class SamHQVisionModel(SamHQPreTrainedModel):
         return self.vision_encoder(pixel_values, **kwargs)


-class SamHQPositionalEmbedding(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.scale = config.hidden_size // 2
-        self.register_buffer("positional_embedding", self.scale * torch.randn((2, config.num_pos_feats)))
-
-    def forward(self, input_coords, input_shape=None):
-        """Positionally encode points that are normalized to [0,1]."""
-        coordinates = input_coords.clone()
-
-        if input_shape is not None:
-            coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1]
-            coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0]
-
-        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
-        coordinates = 2 * coordinates - 1
-        coordinates = coordinates.to(self.positional_embedding.dtype)
-        coordinates = coordinates @ self.positional_embedding
-        coordinates = 2 * np.pi * coordinates
-        # outputs d_1 x ... x d_n x channel shape
-        return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1)
-
-
 class SamHQMaskEmbedding(nn.Module):
     def __init__(self, config: SamHQPromptEncoderConfig):
         super().__init__()
@@ -287,18 +287,17 @@ class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
         super().__init__()
         self.max_len = config.max_source_positions
         self.d_model = config.hidden_size
-        self.pe = None
-        self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
+        self.register_buffer("pe", self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)), persistent=False)

-    def extend_pe(self, x):
+    def extend_pe(self, x, pe=None):
         # Reset the positional encodings
-        if self.pe is not None:
+        if pe is not None:
             # self.pe contains both positive and negative parts
             # the length of self.pe is 2 * input_len - 1
-            if self.pe.size(1) >= x.size(1) * 2 - 1:
-                if self.pe.dtype != x.dtype or self.pe.device != x.device:
-                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
-                return
+            if pe.size(1) >= x.size(1) * 2 - 1:
+                if pe.dtype != x.dtype or pe.device != x.device:
+                    pe = pe.to(dtype=x.dtype, device=x.device)
+                return pe
         # Suppose `i` is the position of query vector and `j` is the
         # position of key vector. We use positive relative positions when keys
         # are to the left (i>j) and negative relative positions otherwise (i<j).
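The first change above replaces the plain `self.pe` attribute with a buffer registered with `persistent=False`: the cached table now follows `module.to(...)`/`half()` casts but is still excluded from the state_dict. A minimal sketch of that distinction (toy module, not the SeamlessM4T code):

```python
import torch
from torch import nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        self.plain = torch.zeros(2)                                        # plain attribute
        self.register_buffer("saved", torch.zeros(2))                      # persistent buffer
        self.register_buffer("cached", torch.zeros(2), persistent=False)   # non-persistent buffer

m = Demo().half()
print(m.plain.dtype, m.saved.dtype, m.cached.dtype)  # torch.float32 torch.float16 torch.float16
print(sorted(m.state_dict()))                        # ['saved'] -- 'cached' is excluded
```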
@@ -319,10 +318,10 @@ class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
         pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
         pe_negative = pe_negative[1:].unsqueeze(0)
         pe = torch.cat([pe_positive, pe_negative], dim=1)
-        self.pe = pe.to(device=x.device, dtype=x.dtype)
+        return pe.to(device=x.device, dtype=x.dtype)

     def forward(self, hidden_states: torch.Tensor):
-        self.extend_pe(hidden_states)
+        self.pe = self.extend_pe(hidden_states, self.pe)
         start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
         end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
         relative_position_embeddings = self.pe[:, start_idx:end_idx]
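forward() then slices a window of length 2*T - 1 out of the precomputed table, centred on the zero-offset entry (positive offsets first, negative offsets after). A quick check of that index arithmetic in plain Python, not the module itself:

```python
# pe holds encodings for relative offsets max_len-1 ... 0 ... -(max_len-1),
# i.e. 2 * max_len - 1 entries, with offset 0 sitting at index pe_len // 2.
max_len, seq_len = 10, 4
pe_len = 2 * max_len - 1            # 19
center = pe_len // 2                # 9, the offset-0 entry

start_idx = center - seq_len + 1    # first index kept: offset seq_len - 1
end_idx = center + seq_len          # one past the last index kept: offset -(seq_len - 1)
window = list(range(start_idx, end_idx))
print(len(window))                  # 7 == 2 * seq_len - 1
```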
@@ -884,13 +883,14 @@ class SeamlessM4TScaledWordEmbedding(nn.Embedding):
         return super().forward(input_ids) * self.embed_scale


-# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding with M2M100->SeamlessM4T
 class SeamlessM4TSinusoidalPositionalEmbedding(nn.Module):
     """This module produces sinusoidal positional embeddings of any length."""

     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__()
         self.offset = 2
+        self.num_positions = num_positions
         self.embedding_dim = embedding_dim
         self.padding_idx = padding_idx
         self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
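For context, this module builds a fairseq-style sinusoidal table: geometrically spaced frequencies, the sin block concatenated before the cos block, an optional zero column for odd dimensions, and a zeroed row at padding_idx. A self-contained sketch of that construction (an approximation, not the exact library helper):

```python
import math
from typing import Optional

import torch

def sinusoidal_table(num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> torch.Tensor:
    # Geometrically spaced frequencies, then [sin | cos] concatenated along the feature axis.
    half_dim = embedding_dim // 2
    freq = torch.exp(torch.arange(half_dim, dtype=torch.float) * -(math.log(10000.0) / (half_dim - 1)))
    angles = torch.arange(num_positions, dtype=torch.float).unsqueeze(1) * freq.unsqueeze(0)
    table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    if embedding_dim % 2 == 1:                       # zero-pad the odd leftover channel
        table = torch.cat([table, torch.zeros(num_positions, 1)], dim=1)
    if padding_idx is not None:                      # the padding position gets an all-zero vector
        table[padding_idx] = 0.0
    return table

print(sinusoidal_table(8 + 2, 16, padding_idx=1).shape)  # torch.Size([10, 16])
```

Storing `num_positions` on the module, as the added line does, is what lets `_init_weights` rebuild a table of exactly this size later on.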
@@ -1375,11 +1375,27 @@ class SeamlessM4TPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Conv1d):
             init.kaiming_normal_(module.weight)
             if module.bias is not None:
                 k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                 init.uniform_(module.bias, a=-k, b=k)
+        elif isinstance(module, SeamlessM4TSinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(
+                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+            )
+            init.copy_(module.weights, emb_weights)
+        elif isinstance(module, SeamlessM4TConformerRotaryPositionalEmbedding):
+            dim = self.config.hidden_size // self.config.speech_encoder_attention_heads
+            base = self.config.rotary_embedding_base
+            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+            init.copy_(module.inv_freq, inv_freq)
+        elif isinstance(module, SeamlessM4TConformerRelPositionalEmbedding):
+            init.copy_(module.pe, module.extend_pe(torch.tensor(0.0).expand(1, module.max_len)))

     def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
         kernel_size, stride = self.config.adaptor_kernel_size, self.config.adaptor_stride
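The rotary branch recomputes the standard inverse-frequency vector, one frequency per pair of channels of the per-head dimension. A standalone illustration of that formula with made-up sizes:

```python
import torch

hidden_size, num_heads, base = 1024, 16, 10000.0
dim = hidden_size // num_heads                      # per-head dimension, 64 here
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
print(inv_freq.shape)                               # torch.Size([32]) -- one frequency per channel pair
print(inv_freq[0].item(), inv_freq[-1].item())      # 1.0 down to just above 1/base
```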
@@ -762,6 +762,7 @@ class SeamlessM4Tv2SinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__()
         self.offset = 2
+        self.num_positions = num_positions
         self.embedding_dim = embedding_dim
         self.padding_idx = padding_idx
         self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

@@ -1292,6 +1293,11 @@ class SeamlessM4Tv2PreTrainedModel(PreTrainedModel):
             if module.bias is not None:
                 k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                 init.uniform_(module.bias, a=-k, b=k)
+        elif isinstance(module, SeamlessM4Tv2SinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(
+                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+            )
+            init.copy_(module.weights, emb_weights)

     # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TPreTrainedModel._compute_sub_sample_lengths_from_attention_mask
     def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
@@ -311,7 +311,7 @@ class SeedOssRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    @staticmethod
    def compute_default_rope_parameters(
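Registering `original_inv_freq` as a cloned, non-persistent buffer keeps a pristine copy of the frequencies alongside `inv_freq`, which dynamic RoPE variants can rescale and later restore; the `clone()` is what keeps the two buffers independent. A minimal sketch of that pattern (toy module with hypothetical rescale/reset helpers, not the SeedOss class):

```python
import torch
from torch import nn

class RotarySketch(nn.Module):
    def __init__(self, dim: int = 64, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)  # pristine copy

    def rescale(self, factor: float):
        self.inv_freq = self.inv_freq / factor          # e.g. a long-context rescaling step

    def reset(self):
        self.inv_freq = self.original_inv_freq.clone()  # restore the unscaled frequencies

m = RotarySketch()
m.rescale(2.0)
m.reset()
print(torch.equal(m.inv_freq, m.original_inv_freq))    # True
```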
@@ -168,7 +168,6 @@ class SegformerImageProcessorFast(BaseImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)

         # Stack images into a single tensor if return_tensors is set
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -549,9 +549,9 @@ class SegformerMLP(nn.Module):
         return hidden_states


-class SegformerDecodeHead(SegformerPreTrainedModel):
+class SegformerDecodeHead(nn.Module):
     def __init__(self, config):
-        super().__init__(config)
+        super().__init__()
         # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size
         mlps = []
         for i in range(config.num_encoder_blocks):

@@ -140,7 +140,6 @@ class SegformerImageProcessorFast(BeitImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)

         # Stack images into a single tensor if return_tensors is set
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -57,6 +57,7 @@ class ShieldGemma2ForImageClassification(PreTrainedModel):
         self.yes_token_index = getattr(config, "yes_token_index", 10_784)
         self.no_token_index = getattr(config, "no_token_index", 3771)
         self.model = AutoModelForImageTextToText.from_config(config=config)
+        self.post_init()

     def get_input_embeddings(self):
         return self.model.language_model.get_input_embeddings()
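Several classes in this release gain a `self.post_init()` call at the end of `__init__`; on PreTrainedModel this triggers weight initialization and the remaining setup once all submodules exist. A hedged sketch of the pattern with a toy config and model (assuming the public PretrainedConfig/PreTrainedModel API; not one of the classes touched here):

```python
from torch import nn
from transformers import PretrainedConfig, PreTrainedModel

class ToyConfig(PretrainedConfig):
    model_type = "toy"

    def __init__(self, hidden_size: int = 8, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size

class ToyModel(PreTrainedModel):
    config_class = ToyConfig

    def __init__(self, config: ToyConfig):
        super().__init__(config)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
        # Run weight init and final processing once every submodule has been created,
        # mirroring the post_init() calls added throughout this diff.
        self.post_init()

model = ToyModel(ToyConfig())
print(sum(p.numel() for p in model.parameters()))  # 72
```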
@@ -430,6 +430,8 @@ class SiglipPreTrainedModel(PreTrainedModel):
                 else self.config.hidden_size
             )
             init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
+            if hasattr(module, "position_ids"):
+                init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, nn.Embedding):
             default_flax_embed_init(module.weight)
         elif isinstance(module, SiglipAttention):

@@ -465,6 +467,8 @@ class SiglipPreTrainedModel(PreTrainedModel):
         elif isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+        elif isinstance(module, SiglipTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 # Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->Siglip
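The new branches rewrite the `position_ids` buffer with consecutive indices; the expression is simply 0..N-1 broadcast to shape (1, N). A standalone illustration using plain torch rather than the `init.copy_` helper used in the diff:

```python
import torch

num_positions = 6
position_ids = torch.zeros(1, num_positions, dtype=torch.long)      # stand-in for the registered buffer
position_ids.copy_(torch.arange(num_positions).expand((1, -1)))     # same expression as in _init_weights
print(position_ids)                                                  # tensor([[0, 1, 2, 3, 4, 5]])
```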
@@ -502,9 +506,11 @@ class SiglipEncoder(nn.Module):
         return BaseModelOutput(last_hidden_state=hidden_states)


-class SiglipTextTransformer(nn.Module):
+class SiglipTextTransformer(SiglipPreTrainedModel):
+    _input_embed_layer = "token_embedding"
+
     def __init__(self, config: SiglipTextConfig):
-        super().__init__()
+        super().__init__(config)
         self.config = config
         embed_dim = config.hidden_size
         self.embeddings = SiglipTextEmbeddings(config)

@@ -512,6 +518,7 @@ class SiglipTextTransformer(nn.Module):
         self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

         self.head = nn.Linear(embed_dim, config.projection_size)
+        self.post_init()

     @can_return_tuple
     @auto_docstring
@@ -614,6 +621,7 @@ class SiglipTextModel(SiglipPreTrainedModel):


 class SiglipVisionTransformer(SiglipPreTrainedModel):
+    _input_embed_layer = "patch_embedding"
     _can_record_outputs = {
         "hidden_states": SiglipEncoderLayer,
         "attentions": SiglipAttention,

@@ -631,6 +639,8 @@ class SiglipVisionTransformer(SiglipPreTrainedModel):
         if self.use_head:
             self.head = SiglipMultiheadAttentionPoolingHead(config)

+        self.post_init()
+
     @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
     def forward(
@@ -774,6 +784,12 @@ class SiglipModel(SiglipPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()

+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.embeddings.token_embedding
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.text_model.embeddings.token_embedding = value
+
     @filter_out_non_signature_kwargs()
     @auto_docstring
     def get_text_features(
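SiglipModel now exposes its token embedding through the standard get_input_embeddings/set_input_embeddings accessors, which generic utilities (for example, embedding resizing or replacement) rely on. A hedged sketch of that contract with toy modules, not the Siglip classes themselves:

```python
from torch import nn

class TinyTextTower(nn.Module):
    def __init__(self, vocab_size: int = 10, hidden: int = 4):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, hidden)

class TinyWrapper(nn.Module):
    def __init__(self):
        super().__init__()
        self.text_model = TinyTextTower()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.token_embedding

    def set_input_embeddings(self, value: nn.Module):
        self.text_model.token_embedding = value

m = TinyWrapper()
m.set_input_embeddings(nn.Embedding(12, 4))      # e.g. after growing the vocabulary
print(m.get_input_embeddings().num_embeddings)   # 12
```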
@@ -969,6 +985,12 @@ class SiglipForImageClassification(SiglipPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()

+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.vision_model.embeddings.patch_embedding = value
+
     @check_model_inputs
     @auto_docstring
     def forward(