transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/resnet/modeling_resnet.py

@@ -262,9 +262,14 @@ class ResNetPreTrainedModel(PreTrainedModel):
                 fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(module.weight)
                 bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
                 init.uniform_(module.bias, -bound, bound)
-
-
-            init.
+        # We need to check it like that as some Detr models replace the BatchNorm2d by their own
+        elif "BatchNorm" in module.__class__.__name__:
+            init.ones_(module.weight)
+            init.zeros_(module.bias)
+            init.zeros_(module.running_mean)
+            init.ones_(module.running_var)
+            if getattr(module, "num_batches_tracked", None) is not None:
+                init.zeros_(module.num_batches_tracked)
 
 
 @auto_docstring
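The new branch matches on the class name rather than on `nn.BatchNorm2d`, so the custom BatchNorm replacements some DETR-style models swap in get their statistics reset as well. Below is a minimal standalone sketch of the same pattern, assuming plain `torch.nn.init` instead of transformers' internal `init` helpers and a hypothetical `FrozenBatchNorm2d` stand-in:

```python
import torch
from torch import nn


class FrozenBatchNorm2d(nn.Module):
    """Toy stand-in for a custom BatchNorm replacement (hypothetical, for illustration only)."""

    def __init__(self, num_features):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(num_features))
        self.bias = nn.Parameter(torch.empty(num_features))
        self.register_buffer("running_mean", torch.empty(num_features))
        self.register_buffer("running_var", torch.empty(num_features))
        # unlike nn.BatchNorm2d, no num_batches_tracked buffer


def reset_batchnorm_like(module):
    # Duck-typed check mirroring the diff: match any class whose name contains "BatchNorm".
    if "BatchNorm" in module.__class__.__name__:
        nn.init.ones_(module.weight)
        nn.init.zeros_(module.bias)
        nn.init.zeros_(module.running_mean)
        nn.init.ones_(module.running_var)
        # Guard the optional buffer, exactly as the new code does.
        if getattr(module, "num_batches_tracked", None) is not None:
            nn.init.zeros_(module.num_batches_tracked)


reset_batchnorm_like(FrozenBatchNorm2d(8))   # caught by the name check
reset_batchnorm_like(nn.BatchNorm2d(8))      # stock BatchNorm2d still works
```
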
transformers/models/roberta/modeling_roberta.py

@@ -501,6 +501,9 @@ class RobertaPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, RobertaLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, RobertaEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class RobertaEncoder(nn.Module):
transformers/models/roberta/modular_roberta.py

@@ -172,6 +172,9 @@ class RobertaPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, RobertaLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, RobertaEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class RobertaModel(BertModel):
transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py

@@ -561,6 +561,9 @@ class RobertaPreLayerNormPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, RobertaPreLayerNormLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, RobertaPreLayerNormEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 @auto_docstring(
transformers/models/roc_bert/modeling_roc_bert.py

@@ -621,6 +621,9 @@ class RoCBertPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, RoCBertLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, RoCBertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 @auto_docstring(
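The Roberta-family and RoCBert branches above re-create the embeddings' `position_ids` and `token_type_ids` buffers during weight initialization instead of relying on values set at construction time. A short sketch of the same buffer-filling idea using plain in-place tensor ops; the `ToyEmbeddings` module and helper below are illustrative, not the transformers classes:

```python
import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    """Minimal stand-in for RobertaEmbeddings-style position/token-type buffers."""

    def __init__(self, max_position_embeddings=512):
        super().__init__()
        self.register_buffer("position_ids", torch.empty(1, max_position_embeddings, dtype=torch.long), persistent=False)
        self.register_buffer("token_type_ids", torch.empty(1, max_position_embeddings, dtype=torch.long), persistent=False)


def init_embedding_buffers(module):
    # Same shape-driven fill as the diff, with plain in-place ops instead of transformers' `init` helpers.
    with torch.no_grad():
        module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        module.token_type_ids.zero_()


emb = ToyEmbeddings()
init_embedding_buffers(emb)
assert emb.position_ids[0, :3].tolist() == [0, 1, 2]
assert int(emb.token_type_ids.sum()) == 0
```
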
transformers/models/rt_detr/configuration_rt_detr.py

@@ -44,7 +44,7 @@ class RTDetrConfig(PreTrainedConfig):
             The epsilon used by the layer normalization layers.
         batch_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the batch normalization layers.
-        backbone_config (`
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `RTDetrResNetConfig()`):
             The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
transformers/models/rt_detr/modeling_rt_detr.py

@@ -1059,6 +1059,10 @@ class RTDetrPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
 
         elif isinstance(module, nn.LayerNorm):
             init.ones_(module.weight)
transformers/models/rt_detr/modeling_rt_detr_resnet.py

@@ -316,9 +316,14 @@ class RTDetrResNetPreTrainedModel(PreTrainedModel):
                 fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(module.weight)
                 bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
                 init.uniform_(module.bias, -bound, bound)
-
-
-            init.
+        # We need to check it like that as some Detr models replace the BatchNorm2d by their own
+        elif "BatchNorm" in module.__class__.__name__:
+            init.ones_(module.weight)
+            init.zeros_(module.bias)
+            init.zeros_(module.running_mean)
+            init.ones_(module.running_var)
+            if getattr(module, "num_batches_tracked", None) is not None:
+                init.zeros_(module.num_batches_tracked)
 
 
 @auto_docstring(
transformers/models/rt_detr_v2/configuration_rt_detr_v2.py

@@ -18,7 +18,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
 from ...utils.backbone_utils import verify_backbone_config_arguments

@@ -49,7 +48,7 @@ class RTDetrV2Config(PreTrainedConfig):
             The epsilon used by the layer normalization layers.
         batch_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the batch normalization layers.
-        backbone_config (`
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `RTDetrV2ResNetConfig()`):
             The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this

@@ -357,8 +356,8 @@ class RTDetrV2Config(PreTrainedConfig):
         self.decoder_n_levels = decoder_n_levels
         self.decoder_offset_scale = decoder_offset_scale
         self.decoder_method = decoder_method
+
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
-        self.tie_encoder_decoder = True
 
 
 __all__ = ["RTDetrV2Config"]
transformers/models/rt_detr_v2/modeling_rt_detr_v2.py

@@ -506,6 +506,10 @@ class RTDetrV2PreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
 
         elif isinstance(module, nn.LayerNorm):
             init.ones_(module.weight)

@@ -515,6 +519,9 @@ class RTDetrV2PreTrainedModel(PreTrainedModel):
             init.xavier_uniform_(module.weight_embedding.weight)
         if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0:
             init.xavier_uniform_(module.denoising_class_embed.weight)
+        if isinstance(module, RTDetrV2MultiscaleDeformableAttention):
+            n_points_scale = [1 / n for n in module.n_points_list for _ in range(n)]
+            init.copy_(module.n_points_scale, torch.tensor(n_points_scale, dtype=torch.float32))
 
 
 @dataclass
transformers/models/rt_detr_v2/modular_rt_detr_v2.py

@@ -19,6 +19,7 @@ import torch
 import torch.nn.functional as F
 from torch import Tensor, nn
 
+from ... import initialization as init
 from ...configuration_utils import PreTrainedConfig
 from ...utils import is_torchdynamo_compiling, logging
 from ...utils.backbone_utils import (

@@ -59,7 +60,7 @@ class RTDetrV2Config(PreTrainedConfig):
             The epsilon used by the layer normalization layers.
         batch_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the batch normalization layers.
-        backbone_config (`
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `RTDetrV2ResNetConfig()`):
             The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this

@@ -367,8 +368,8 @@ class RTDetrV2Config(PreTrainedConfig):
         self.decoder_n_levels = decoder_n_levels
         self.decoder_offset_scale = decoder_offset_scale
         self.decoder_method = decoder_method
+
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
-        self.tie_encoder_decoder = True
 
 
 def multi_scale_deformable_attention_v2(

@@ -564,7 +565,11 @@ class RTDetrV2DecoderLayer(RTDetrDecoderLayer):
 
 
 class RTDetrV2PreTrainedModel(RTDetrPreTrainedModel):
-
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, RTDetrV2MultiscaleDeformableAttention):
+            n_points_scale = [1 / n for n in module.n_points_list for _ in range(n)]
+            init.copy_(module.n_points_scale, torch.tensor(n_points_scale, dtype=torch.float32))
 
 
 class RTDetrV2Decoder(RTDetrDecoder):
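The `_init_weights` override added for `RTDetrV2PreTrainedModel` fills the deformable attention's `n_points_scale` buffer with one `1/n` entry per sampling point of each feature level. A worked example of that comprehension, assuming a hypothetical `n_points_list` of `[4, 4, 2]`:

```python
import torch

# Hypothetical per-level sampling point counts; the real values come from the RT-DETR v2 config.
n_points_list = [4, 4, 2]

# One scale entry per sampling point: 1/n repeated n times for a level with n points (as in the diff).
n_points_scale = [1 / n for n in n_points_list for _ in range(n)]
print(n_points_scale)  # [0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5]

# The buffer filled by init.copy_ is just this list as a float32 tensor of length sum(n_points_list).
print(torch.tensor(n_points_scale, dtype=torch.float32).shape)  # torch.Size([10])
```
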
transformers/models/rwkv/modeling_rwkv.py

@@ -49,7 +49,7 @@ def load_wkv_cuda_kernel(context_length):
     if not is_kernels_available():
         raise ImportError("kernels is not installed, please install it with `pip install kernels`")
 
-    from
+    from ...integrations.hub_kernels import get_kernel
 
     rwkv_cuda_kernel = get_kernel("kernels-community/rwkv")
     rwkv_cuda_kernel.max_seq_length = context_length
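The RWKV change routes kernel loading through `transformers.integrations.hub_kernels.get_kernel` instead of importing it directly. A rough sketch of the guarded lazy-loading pattern the function follows, assuming the standalone `kernels` package and its `get_kernel` entry point rather than the internal wrapper used in the diff:

```python
import importlib.util


def is_kernels_available() -> bool:
    # Availability check mirroring the guard in the diff (generic importlib version).
    return importlib.util.find_spec("kernels") is not None


def load_wkv_cuda_kernel(context_length: int):
    if not is_kernels_available():
        raise ImportError("kernels is not installed, please install it with `pip install kernels`")
    # Import lazily so the hub download only happens when the kernel is actually needed.
    from kernels import get_kernel

    rwkv_cuda_kernel = get_kernel("kernels-community/rwkv")
    rwkv_cuda_kernel.max_seq_length = context_length
    return rwkv_cuda_kernel
```
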
transformers/models/sam/configuration_sam.py

@@ -249,6 +249,7 @@ class SamVisionConfig(PreTrainedConfig):
         self.global_attn_indexes = global_attn_indexes
         self.num_pos_feats = num_pos_feats
         self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim
+        self.scale = self.hidden_size // 2
 
 
 class SamConfig(PreTrainedConfig):
transformers/models/sam/image_processing_sam_fast.py

@@ -267,7 +267,6 @@ class SamImageProcessorFast(BaseImageProcessorFast):
         if do_pad:
             processed_images = self.pad(processed_images, pad_size=pad_size, disable_grouping=disable_grouping)
 
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "reshaped_input_sizes": reshaped_input_sizes},
             tensor_type=return_tensors,
transformers/models/sam/modeling_sam.py

@@ -548,7 +548,7 @@ class SamMaskDecoder(nn.Module):
 class SamPositionalEmbedding(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.scale = config.
+        self.scale = config.scale
         self.register_buffer("positional_embedding", self.scale * torch.randn((2, config.num_pos_feats)))
 
     def forward(self, input_coords, input_shape=None):

@@ -1014,6 +1014,8 @@ class SamPreTrainedModel(PreTrainedModel):
         elif isinstance(module, SamVisionEncoder):
             if self.config.use_abs_pos:
                 init.zeros_(module.pos_embed)
+        elif isinstance(module, SamPositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)
 
 
 class SamVisionEncoder(SamPreTrainedModel):

@@ -1048,6 +1050,7 @@ class SamVisionEncoder(SamPreTrainedModel):
         self.neck = SamVisionNeck(config)
 
         self.gradient_checkpointing = False
+        self.post_init()
 
     def get_input_embeddings(self):
         return self.patch_embed
transformers/models/sam2/configuration_sam2.py

@@ -152,7 +152,7 @@ class Sam2VisionConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.
 
     Args:
-        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Sam2HieraDetConfig()`):
             Configuration for the vision backbone. This is used to instantiate the backbone using
             `AutoModel.from_config`.
         backbone_channel_list (`List[int]`, *optional*, defaults to `[768, 384, 192, 96]`):
transformers/models/sam2/modeling_sam2.py

@@ -565,7 +565,9 @@ class Sam2PreTrainedModel(PreTrainedModel):
                 init.zeros_(module.pos_embed)
             if module.pos_embed_window is not None:
                 init.zeros_(module.pos_embed_window)
-
+        elif isinstance(module, Sam2PositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)
+        elif isinstance(module, Sam2Model):
             if module.no_memory_embedding is not None:
                 init.zeros_(module.no_memory_embedding)
 

@@ -600,6 +602,8 @@ class Sam2HieraDetModel(Sam2PreTrainedModel):
                 self.blocks.append(block)
                 total_block_idx += 1
 
+        self.post_init()
+
     def get_input_embeddings(self):
         return self.patch_embed
 
transformers/models/sam2/modular_sam2.py

@@ -681,7 +681,9 @@ class Sam2PreTrainedModel(PreTrainedModel):
                 init.zeros_(module.pos_embed)
             if module.pos_embed_window is not None:
                 init.zeros_(module.pos_embed_window)
-
+        elif isinstance(module, Sam2PositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)
+        elif isinstance(module, Sam2Model):
             if module.no_memory_embedding is not None:
                 init.zeros_(module.no_memory_embedding)
 

@@ -716,6 +718,8 @@ class Sam2HieraDetModel(Sam2PreTrainedModel):
                 self.blocks.append(block)
                 total_block_idx += 1
 
+        self.post_init()
+
     def get_input_embeddings(self):
         return self.patch_embed
 
transformers/models/sam2_video/modeling_sam2_video.py

@@ -209,7 +209,7 @@ class Sam2VideoInferenceSession:
         device_inputs = {}
         for key, value in inputs.items():
             if isinstance(value, torch.Tensor):
-                device_inputs[key] = value.to(self.inference_device, non_blocking=
+                device_inputs[key] = value.to(self.inference_device, non_blocking=False)
             else:
                 device_inputs[key] = value
         self.point_inputs_per_obj[obj_idx][frame_idx] = device_inputs
@@ -688,6 +688,12 @@ class Sam2VideoPreTrainedModel(PreTrainedModel):
         if isinstance(module, Sam2VideoMemoryFuserCXBlock):
             if module.scale is not None:
                 init.zeros_(module.scale)
+        elif isinstance(module, Sam2VideoVisionRotaryEmbedding):
+            inv_freq = module.create_inv_freq()
+            init.copy_(module.rope_embeddings_cos, inv_freq.cos())
+            init.copy_(module.rope_embeddings_sin, inv_freq.sin())
+        elif isinstance(module, Sam2VideoPositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)
 
 
 class Sam2VideoVisionRotaryEmbedding(nn.Module):
@@ -698,24 +704,17 @@ class Sam2VideoVisionRotaryEmbedding(nn.Module):
 
     def __init__(self, config: Sam2VideoConfig):
         super().__init__()
-        dim = config.memory_attention_hidden_size // (
+        self.dim = config.memory_attention_hidden_size // (
             config.memory_attention_downsample_rate * config.memory_attention_num_attention_heads
         )
         # Ensure even dimension for proper axial splitting
-        if dim % 4 != 0:
+        if self.dim % 4 != 0:
             raise ValueError("Dimension must be divisible by 4 for axial RoPE")
-        end_x, end_y = config.memory_attention_rope_feat_sizes
-
+        self.end_x, self.end_y = config.memory_attention_rope_feat_sizes
+        self.memory_attention_rope_theta = config.memory_attention_rope_theta
 
-        # Generate 2D position indices for axial rotary embedding
-        flattened_indices = torch.arange(end_x * end_y, dtype=torch.long)
-        x_positions = flattened_indices % end_x
-        y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor")
-        freqs_x = torch.outer(x_positions, freqs).float()
-        freqs_y = torch.outer(y_positions, freqs).float()
-        inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
-        inv_freq = inv_freq.repeat_interleave(2, dim=-1)
         # directly register the cos and sin embeddings as we have a fixed feature shape
+        inv_freq = self.create_inv_freq()
         self.register_buffer("rope_embeddings_cos", inv_freq.cos(), persistent=False)
         self.register_buffer("rope_embeddings_sin", inv_freq.sin(), persistent=False)
 
@@ -724,6 +723,20 @@ class Sam2VideoVisionRotaryEmbedding(nn.Module):
         # As the feature map size is fixed, we can just return the pre-computed embeddings.
         return self.rope_embeddings_cos, self.rope_embeddings_sin

+    def create_inv_freq(self):
+        freqs = 1.0 / (
+            self.memory_attention_rope_theta ** (torch.arange(0, self.dim, 4)[: (self.dim // 4)].float() / self.dim)
+        )
+        # Generate 2D position indices for axial rotary embedding
+        flattened_indices = torch.arange(self.end_x * self.end_y, dtype=torch.long)
+        x_positions = flattened_indices % self.end_x
+        y_positions = torch.div(flattened_indices, self.end_x, rounding_mode="floor")
+        freqs_x = torch.outer(x_positions, freqs).float()
+        freqs_y = torch.outer(y_positions, freqs).float()
+        inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
+        inv_freq = inv_freq.repeat_interleave(2, dim=-1)
+        return inv_freq
+

 def rotate_pairwise(x):
     """
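The `create_inv_freq` method factored out above builds the fixed axial 2D RoPE angle table that was previously computed inline in `__init__`; because the resulting cos/sin tables are non-persistent buffers, `_init_weights` can now rebuild them via `init.copy_`, as the earlier hunk shows. A standalone sketch of the same table construction; `dim`, `end_x`, `end_y`, and `theta` below are example values, not taken from any released config:

```python
import torch


def build_axial_rope_tables(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
    # Per-pair frequencies; the channel dimension is split between the x and y axes
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
    # 2D positions of a flattened end_x * end_y feature grid
    flattened_indices = torch.arange(end_x * end_y, dtype=torch.long)
    x_positions = flattened_indices % end_x
    y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor")
    freqs_x = torch.outer(x_positions, freqs).float()
    freqs_y = torch.outer(y_positions, freqs).float()
    angles = torch.cat([freqs_x, freqs_y], dim=-1).repeat_interleave(2, dim=-1)
    return angles.cos(), angles.sin()


cos, sin = build_axial_rope_tables(dim=64, end_x=7, end_y=7)
print(cos.shape)  # torch.Size([49, 64])
```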
@@ -1101,6 +1114,31 @@ class Sam2VideoMemoryEncoder(nn.Module):
         return vision_features, vision_pos_enc


+class Sam2VideoPositionalEmbedding(nn.Module):
+    def __init__(self, config: Sam2VideoPromptEncoderConfig):
+        super().__init__()
+        self.scale = config.scale
+        positional_embedding = self.scale * torch.randn((2, config.hidden_size // 2))
+        self.register_buffer("positional_embedding", positional_embedding)
+
+    def forward(self, input_coords, input_shape=None):
+        """Positionally encode points that are normalized to [0,1]."""
+        coordinates = input_coords.clone()
+
+        if input_shape is not None:
+            coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1]
+            coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0]
+            coordinates.to(torch.float32)
+
+        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+        coordinates = 2 * coordinates - 1
+        coordinates = coordinates.to(self.positional_embedding.dtype)
+        coordinates = coordinates @ self.positional_embedding
+        coordinates = 2 * np.pi * coordinates
+        # outputs d_1 x ... x d_n x channel shape
+        return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1)
+
+
 @dataclass
 @auto_docstring(custom_intro="Base class for the vision encoder's outputs.")
 class Sam2VideoVisionEncoderOutput(ModelOutput):
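The relocated `Sam2VideoPositionalEmbedding` encodes normalized point coordinates with random Fourier features: coordinates in [0, 1] are mapped to [-1, 1], projected through a fixed Gaussian matrix, and expanded into sin/cos features. A minimal standalone sketch of that math; the hidden size, scale, and point shapes below are illustrative, not model defaults:

```python
import numpy as np
import torch

hidden_size, scale = 256, 1.0
# Fixed Gaussian projection, analogous to the registered `positional_embedding` buffer
positional_embedding = scale * torch.randn((2, hidden_size // 2))

points = torch.rand(1, 1, 3, 2)         # batch x objects x points x (x, y), already in [0, 1]
coords = 2 * points - 1                  # map to [-1, 1]
coords = coords @ positional_embedding   # (..., hidden_size // 2)
coords = 2 * np.pi * coords
features = torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
print(features.shape)                    # torch.Size([1, 1, 3, 256])
```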
@@ -1130,31 +1168,6 @@ class Sam2VideoVisionEncoderOutput(ModelOutput):
     attentions: Optional[tuple[torch.FloatTensor, ...]] = None


-class Sam2VideoPositionalEmbedding(nn.Module):
-    def __init__(self, config: Sam2VideoPromptEncoderConfig):
-        super().__init__()
-        self.scale = config.scale
-        positional_embedding = self.scale * torch.randn((2, config.hidden_size // 2))
-        self.register_buffer("positional_embedding", positional_embedding)
-
-    def forward(self, input_coords, input_shape=None):
-        """Positionally encode points that are normalized to [0,1]."""
-        coordinates = input_coords.clone()
-
-        if input_shape is not None:
-            coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1]
-            coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0]
-            coordinates.to(torch.float32)
-
-        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
-        coordinates = 2 * coordinates - 1
-        coordinates = coordinates.to(self.positional_embedding.dtype)
-        coordinates = coordinates @ self.positional_embedding
-        coordinates = 2 * np.pi * coordinates
-        # outputs d_1 x ... x d_n x channel shape
-        return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1)
-
-
 class Sam2VideoMaskEmbedding(nn.Module):
     def __init__(self, config: Sam2VideoPromptEncoderConfig):
         super().__init__()
@@ -1559,11 +1572,6 @@ class Sam2VideoModel(Sam2VideoPreTrainedModel):
     input_modalities = ("video", "text")
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(Sam2VideoTwoWayAttentionBlock, index=2)}
     _keys_to_ignore_on_load_unexpected = []
-    _tied_weights_keys = {
-        "prompt_encoder.shared_embedding.positional_embedding": "shared_image_embedding.positional_embedding"
-    }
-    # need to be ignored, as it's a buffer and will not be correctly detected as tied weight
-    _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"]

     def __init__(self, config: Sam2VideoConfig):
         super().__init__(config)
@@ -51,6 +51,7 @@ from ..sam2.modeling_sam2 import (
     Sam2ImageSegmentationOutput,
     Sam2LayerNorm,
     Sam2Model,
+    Sam2PositionalEmbedding,
     Sam2SinePositionEmbedding,
     Sam2TwoWayAttentionBlock,
     eager_attention_forward,
@@ -477,7 +478,7 @@ class Sam2VideoInferenceSession:
         device_inputs = {}
         for key, value in inputs.items():
             if isinstance(value, torch.Tensor):
-                device_inputs[key] = value.to(self.inference_device, non_blocking=
+                device_inputs[key] = value.to(self.inference_device, non_blocking=False)
             else:
                 device_inputs[key] = value
         self.point_inputs_per_obj[obj_idx][frame_idx] = device_inputs

@@ -1013,6 +1014,12 @@ class Sam2VideoPreTrainedModel(PreTrainedModel):
         if isinstance(module, Sam2VideoMemoryFuserCXBlock):
             if module.scale is not None:
                 init.zeros_(module.scale)
+        elif isinstance(module, Sam2VideoVisionRotaryEmbedding):
+            inv_freq = module.create_inv_freq()
+            init.copy_(module.rope_embeddings_cos, inv_freq.cos())
+            init.copy_(module.rope_embeddings_sin, inv_freq.sin())
+        elif isinstance(module, Sam2VideoPositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)


 class Sam2VideoVisionRotaryEmbedding(nn.Module):
@@ -1023,24 +1030,17 @@ class Sam2VideoVisionRotaryEmbedding(nn.Module):

     def __init__(self, config: Sam2VideoConfig):
         super().__init__()
-        dim = config.memory_attention_hidden_size // (
+        self.dim = config.memory_attention_hidden_size // (
             config.memory_attention_downsample_rate * config.memory_attention_num_attention_heads
         )
         # Ensure even dimension for proper axial splitting
-        if dim % 4 != 0:
+        if self.dim % 4 != 0:
             raise ValueError("Dimension must be divisible by 4 for axial RoPE")
-        end_x, end_y = config.memory_attention_rope_feat_sizes
-
+        self.end_x, self.end_y = config.memory_attention_rope_feat_sizes
+        self.memory_attention_rope_theta = config.memory_attention_rope_theta

-        # Generate 2D position indices for axial rotary embedding
-        flattened_indices = torch.arange(end_x * end_y, dtype=torch.long)
-        x_positions = flattened_indices % end_x
-        y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor")
-        freqs_x = torch.outer(x_positions, freqs).float()
-        freqs_y = torch.outer(y_positions, freqs).float()
-        inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
-        inv_freq = inv_freq.repeat_interleave(2, dim=-1)
         # directly register the cos and sin embeddings as we have a fixed feature shape
+        inv_freq = self.create_inv_freq()
         self.register_buffer("rope_embeddings_cos", inv_freq.cos(), persistent=False)
         self.register_buffer("rope_embeddings_sin", inv_freq.sin(), persistent=False)

@@ -1049,6 +1049,20 @@ class Sam2VideoVisionRotaryEmbedding(nn.Module):
         # As the feature map size is fixed, we can just return the pre-computed embeddings.
         return self.rope_embeddings_cos, self.rope_embeddings_sin

+    def create_inv_freq(self):
+        freqs = 1.0 / (
+            self.memory_attention_rope_theta ** (torch.arange(0, self.dim, 4)[: (self.dim // 4)].float() / self.dim)
+        )
+        # Generate 2D position indices for axial rotary embedding
+        flattened_indices = torch.arange(self.end_x * self.end_y, dtype=torch.long)
+        x_positions = flattened_indices % self.end_x
+        y_positions = torch.div(flattened_indices, self.end_x, rounding_mode="floor")
+        freqs_x = torch.outer(x_positions, freqs).float()
+        freqs_y = torch.outer(y_positions, freqs).float()
+        inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
+        inv_freq = inv_freq.repeat_interleave(2, dim=-1)
+        return inv_freq
+

 def rotate_pairwise(x):
     """
@@ -1426,6 +1440,10 @@ class Sam2VideoMemoryEncoder(nn.Module):
         return vision_features, vision_pos_enc


+class Sam2VideoPositionalEmbedding(Sam2PositionalEmbedding):
+    pass
+
+
 # a large negative value as a placeholder score for missing objects
 NO_OBJ_SCORE = -1024.0

@@ -1446,11 +1464,6 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000):
 @auto_docstring
 class Sam2VideoModel(Sam2Model):
     input_modalities = ("video", "text")
-    _tied_weights_keys = {
-        "prompt_encoder.shared_embedding.positional_embedding": "shared_image_embedding.positional_embedding"
-    }
-    # need to be ignored, as it's a buffer and will not be correctly detected as tied weight
-    _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"]
     _keys_to_ignore_on_load_unexpected = []
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(Sam2VideoTwoWayAttentionBlock, index=2)}

@@ -122,7 +122,7 @@ class Sam3VisionConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.

     Args:
-        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Sam3ViTConfig()`):
             Configuration for the vision backbone. This is used to instantiate the backbone using
             `AutoModel.from_config`.
         fpn_hidden_size (`int`, *optional*, defaults to 256):
@@ -179,6 +179,16 @@ class Sam3VisionConfig(PreTrainedConfig):
         self.initializer_range = initializer_range
         super().__init__(**kwargs)

+    @property
+    def image_size(self):
+        """Image size for the vision encoder."""
+        return self.backbone_config.image_size
+
+    @image_size.setter
+    def image_size(self, value):
+        """Set the image size and propagate to backbone."""
+        self.backbone_config.image_size = value
+

 class Sam3GeometryEncoderConfig(PreTrainedConfig):
     r"""
@@ -506,6 +516,16 @@ class Sam3Config(PreTrainedConfig):
         self.initializer_range = initializer_range
         super().__init__(**kwargs)

+    @property
+    def image_size(self):
+        """Image size for the SAM3 model."""
+        return self.vision_config.image_size
+
+    @image_size.setter
+    def image_size(self, value):
+        """Set the image size and propagate to vision config."""
+        self.vision_config.image_size = value
+

 __all__ = [
     "Sam3Config",
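Both `Sam3VisionConfig` and `Sam3Config` gain an `image_size` property that reads from, and writes through to, the nested config underneath it. A self-contained sketch of that delegation pattern; the Toy* classes are stand-ins for the real configs, not their actual constructors:

```python
class ToyBackboneConfig:
    def __init__(self, image_size=1024):
        self.image_size = image_size


class ToyVisionConfig:
    """Stand-in for Sam3VisionConfig: image_size lives on the backbone config."""

    def __init__(self):
        self.backbone_config = ToyBackboneConfig()

    @property
    def image_size(self):
        return self.backbone_config.image_size

    @image_size.setter
    def image_size(self, value):
        self.backbone_config.image_size = value


class ToyConfig:
    """Stand-in for Sam3Config: image_size delegates to the vision config."""

    def __init__(self):
        self.vision_config = ToyVisionConfig()

    @property
    def image_size(self):
        return self.vision_config.image_size

    @image_size.setter
    def image_size(self, value):
        self.vision_config.image_size = value


config = ToyConfig()
config.image_size = 512  # propagates through both levels
assert config.vision_config.backbone_config.image_size == 512
```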
@@ -417,6 +417,10 @@ class Sam3ViTRotaryEmbedding(nn.Module):
         # Ensure even dimension for proper axial splitting
         if dim % 4 != 0:
             raise ValueError("Dimension must be divisible by 4 for axial RoPE")
+        self.end_x, self.end_y = end_x, end_y
+        self.dim = dim
+        self.rope_theta = config.rope_theta
+        self.scale = scale
         freqs = 1.0 / (config.rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))

         flattened_indices = torch.arange(end_x * end_y, dtype=torch.long)
@@ -776,6 +780,19 @@ class Sam3PreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, Sam3ViTEmbeddings):
             init.normal_(module.position_embeddings, mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, Sam3ViTRotaryEmbedding):
+            end_x, end_y = module.end_x, module.end_y
+            dim = module.dim
+            freqs = 1.0 / (module.rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+            flattened_indices = torch.arange(end_x * end_y, dtype=torch.long)
+            x_positions = (flattened_indices % end_x) * module.scale
+            y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor") * module.scale
+            freqs_x = torch.outer(x_positions, freqs).float()
+            freqs_y = torch.outer(y_positions, freqs).float()
+            inv_freq = torch.cat([freqs_x, freqs_y], dim=-1)
+            inv_freq = inv_freq.repeat_interleave(2, dim=-1)
+            init.copy_(module.rope_embeddings_cos, inv_freq.cos())
+            init.copy_(module.rope_embeddings_sin, inv_freq.sin())


 @auto_docstring
@@ -1338,6 +1355,8 @@ class Sam3DetrEncoder(Sam3PreTrainedModel):

         self.layers = nn.ModuleList([Sam3DetrEncoderLayer(config) for _ in range(config.num_layers)])

+        self.post_init()
+
     def _prepare_multilevel_features(
         self,
         vision_features: list[torch.Tensor],
@@ -1617,6 +1636,8 @@ class Sam3DetrDecoder(Sam3PreTrainedModel):

         self.position_encoding = Sam3SinePositionEmbedding(num_pos_feats=config.hidden_size // 2, normalize=False)

+        self.post_init()
+
     @compile_compatible_method_lru_cache(maxsize=1)
     def _get_coords(
         self, height: torch.Tensor, width: torch.Tensor, dtype: torch.dtype, device: torch.device
@@ -1987,6 +2008,8 @@ class Sam3MaskDecoder(Sam3PreTrainedModel):
         self.prompt_cross_attn_norm = nn.LayerNorm(hidden_size)
         self.prompt_cross_attn_dropout = nn.Dropout(config.dropout)

+        self.post_init()
+
     @check_model_inputs
     def forward(
         self,