transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff compares the contents of two publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -36,12 +36,12 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutput
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import compile_compatible_method_lru_cache
-from ...utils import ModelOutput, auto_docstring
-from ...utils.generic import OutputRecorder,
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.generic import OutputRecorder, is_flash_attention_requested
 from ..auto import AutoModel
 from .configuration_sam3_tracker_video import (
     Sam3TrackerVideoConfig,

@@ -50,6 +50,9 @@ from .configuration_sam3_tracker_video import (
 )
 
 
+logger = logging.get_logger(__name__)
+
+
 class Sam3TrackerVideoInferenceCache:
     """Cache for vision features and model constants."""
 

@@ -475,9 +478,18 @@ class Sam3TrackerVideoAttention(nn.Module):
         key = self.k_proj(key).view(*new_shape).transpose(1, 2)
         value = self.v_proj(value).view(*new_shape).transpose(1, 2)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        if is_flash_attention_requested(self.config) and attention_similarity is not None:
+            # Target guided masks are represented as float masks and are incompatible with Flash Attention
+            # Fallback to SDPA for this call only so the rest of the model can still benefit from FA
+            attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
+            logger.warning_once(
+                "Falling back to SDPA for target-guided attention because "
+                "Flash Attention does not support additive bias masks."
+            )
 
         attn_output, attn_weights = attention_interface(
             self,
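The hunk above (and the matching hunks in SamHQ and SeedOss further down) all apply the same refactor: the old open-coded eager-vs-backend branch becomes a single `ALL_ATTENTION_FUNCTIONS.get_interface(self.config._attn_implementation, eager_attention_forward)` lookup. A minimal sketch of the lookup semantics, assuming a dict-backed registry; the registry class and placeholder kernels here are illustrative stand-ins, not the transformers implementation:

```python
from collections.abc import Callable


def eager_attention_forward(*args, **kwargs):
    return "eager"  # stand-in for the pure-PyTorch attention path


def sdpa_attention_forward(*args, **kwargs):
    return "sdpa"  # stand-in for torch.nn.functional.scaled_dot_product_attention


class AttentionRegistry(dict):
    """Illustrative registry: maps implementation names to kernels, with an explicit fallback."""

    def get_interface(self, attn_implementation: str | None, default: Callable) -> Callable:
        # "eager" (or an unset implementation) falls through to the supplied default,
        # mirroring get_interface(impl, eager_attention_forward) in the diff above
        if attn_implementation in (None, "eager"):
            return default
        return self[attn_implementation]


ATTENTION_FUNCTIONS = AttentionRegistry(sdpa=sdpa_attention_forward)
assert ATTENTION_FUNCTIONS.get_interface("sdpa", eager_attention_forward)() == "sdpa"
assert ATTENTION_FUNCTIONS.get_interface("eager", eager_attention_forward)() == "eager"
```

The added `is_flash_attention_requested` guard covers the one case a registry lookup cannot: target-guided attention passes an additive float mask, which Flash Attention does not support, so that call site drops to SDPA while the rest of the model keeps Flash Attention.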
@@ -499,7 +511,7 @@ class Sam3TrackerVideoAttention(nn.Module):
         return attn_output, attn_weights
 
 
-class Sam3TrackerVideoTwoWayAttentionBlock(
+class Sam3TrackerVideoTwoWayAttentionBlock(GradientCheckpointingLayer):
     def __init__(self, config: Sam3TrackerVideoMaskDecoderConfig, skip_first_layer_pe: bool = False):
         """
         A transformer block with four layers:

@@ -674,7 +686,7 @@ class Sam3TrackerVideoPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = "video"
     _supports_sdpa = True
-
+    _supports_flash_attn = True
     _supports_attention_backend = True
 
     @torch.no_grad()

@@ -859,9 +871,9 @@ class Sam3TrackerVideoRoPEAttention(nn.Module):
             query, key, cos, sin, repeat_freqs_k=self.rope_k_repeat, num_k_exclude_rope=num_k_exclude_rope
         )
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,

@@ -1124,16 +1136,10 @@ class Sam3TrackerVideoMemoryEncoder(nn.Module):
 
 @dataclass
 @auto_docstring(custom_intro="Base class for the vision encoder's outputs.")
-class Sam3TrackerVideoVisionEncoderOutput(ModelOutput):
+class Sam3TrackerVideoVisionEncoderOutput(BaseModelOutputWithPooling):
     r"""
     last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
         Sequence of hidden-states at the output of the last layer of the model.
-    fpn_hidden_states (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
-    fpn_position_encoding (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
         Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
         one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the

@@ -1142,13 +1148,16 @@ class Sam3TrackerVideoVisionEncoderOutput(ModelOutput):
         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
         sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
         the self-attention heads.
+    fpn_hidden_states (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
+    fpn_position_encoding (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     """
 
-    last_hidden_state: torch.FloatTensor | None = None
     fpn_hidden_states: torch.FloatTensor | None = None
     fpn_position_encoding: torch.FloatTensor | None = None
-    hidden_states: tuple[torch.FloatTensor, ...] | None = None
-    attentions: tuple[torch.FloatTensor, ...] | None = None
 
 
 class Sam3TrackerVideoPositionalEmbedding(nn.Module):
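The output class now inherits `last_hidden_state`, `hidden_states` and `attentions` from `BaseModelOutputWithPooling` and only declares the FPN fields itself. A rough, self-contained stand-in using plain dataclasses rather than transformers' `ModelOutput` machinery (field names taken from the hunk above, the stub parent is an assumption):

```python
from dataclasses import dataclass

import torch


@dataclass
class PoolingOutputStub:
    # fields BaseModelOutputWithPooling would contribute in the real class
    last_hidden_state: torch.FloatTensor | None = None
    pooler_output: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


@dataclass
class VisionEncoderOutputSketch(PoolingOutputStub):
    # the only two fields the subclass still declares itself after the refactor
    fpn_hidden_states: tuple[torch.FloatTensor, ...] | None = None
    fpn_position_encoding: tuple[torch.FloatTensor, ...] | None = None


out = VisionEncoderOutputSketch(last_hidden_state=torch.zeros(1, 4, 4, 8))
print(out.fpn_hidden_states)  # None until the FPN neck populates it
```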
@@ -1579,6 +1588,7 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000):
 class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
     input_modalities = ("video", "text")
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(Sam3TrackerVideoTwoWayAttentionBlock, index=2)}
+    _tied_weights_keys = {}
     _keys_to_ignore_on_load_unexpected = [r"^detector_model."]
     _checkpoint_conversion_mapping = {
         r"tracker_model.(.+)": r"\1",  # the regex allows to remove the prefix, and add it back in revert mode

@@ -1675,7 +1685,8 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
             Input pixel values
         """
         batch_size = pixel_values.shape[0]
-
+        image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+        feature_maps = image_outputs.fpn_hidden_states
 
         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding

@@ -1846,33 +1857,19 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
             frame_idx=frame_idx,
         )
 
+    @can_return_tuple
+    @auto_docstring
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[
-        list[torch.Tensor],
-        list[torch.Tensor],
-        tuple[torch.FloatTensor, ...] | None,
-        tuple[torch.FloatTensor, ...] | None,
-    ]:
+    ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
         r"""
-
-
-        Args:
-            pixel_values (`torch.FloatTensor`):
-                Input pixel values of shape `(batch_size, num_channels, height, width)`.
-
-        Returns:
-            `tuple`: A tuple containing:
-                - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
-                - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
-                - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
-                - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
+        pixel_values (`torch.FloatTensor`):
+            Input pixel values of shape `(batch_size, num_channels, height, width)`.
         """
         vision_outputs: Sam3TrackerVideoVisionEncoderOutput = self.vision_encoder(
-            pixel_values,
-            **kwargs,
+            pixel_values, return_dict=True, **kwargs
         )
 
         feature_maps = vision_outputs.fpn_hidden_states
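`get_image_features` previously returned a bare 4-tuple; it now returns the vision encoder output object, with `@can_return_tuple` covering callers that still want a tuple. A sketch of that decorator pattern under assumed semantics (convert the dataclass result when `return_dict=False`); this is not the transformers source:

```python
import functools
from dataclasses import astuple, dataclass


def can_return_tuple_sketch(func):
    """Return the output object as-is, or as a plain tuple when return_dict=False."""

    @functools.wraps(func)
    def wrapper(*args, return_dict: bool = True, **kwargs):
        output = func(*args, **kwargs)
        return output if return_dict else astuple(output)

    return wrapper


@dataclass
class FeaturesOutput:
    fpn_hidden_states: list | None = None
    fpn_position_encoding: list | None = None


@can_return_tuple_sketch
def get_image_features():
    return FeaturesOutput(fpn_hidden_states=[1], fpn_position_encoding=[2])


print(get_image_features().fpn_hidden_states)    # [1]
print(get_image_features(return_dict=False))     # ([1], [2])
```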
@@ -1890,8 +1887,10 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
             feature_map_position_embedding.flatten(2).permute(2, 0, 1)
             for feature_map_position_embedding in feature_maps_position_embeddings[:-1]
         ]
+        vision_outputs.fpn_hidden_states = feature_maps
+        vision_outputs.fpn_position_encoding = feature_maps_position_embeddings
 
-        return
+        return vision_outputs
 
     def _prepare_vision_features(
         self,

@@ -1908,7 +1907,9 @@
         else:
             # Compute features using image encoder
             image_batch = inference_session.get_frame(frame_idx).unsqueeze(0)  # Add batch dimension
-
+            image_outputs = self.get_image_features(image_batch, return_dict=True)
+            vision_feats = image_outputs.fpn_hidden_states
+            vision_pos_embeds = image_outputs.fpn_position_encoding
             # Cache features
             inference_session.cache.cache_vision_features(
                 frame_idx, {"vision_feats": vision_feats, "vision_pos_embeds": vision_pos_embeds}

@@ -2013,10 +2014,10 @@
         vision_hidden_states = None
 
         if pixel_values is not None:
-
-
-
-
+            image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+            feature_maps = image_outputs.fpn_hidden_states
+            vision_hidden_states = image_outputs.hidden_states
+            vision_attentions = image_outputs.attentions
 
             # add no memory embedding to the last feature map
             feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding

@@ -2506,7 +2507,7 @@
         num_object_pointer_tokens = object_pointers.shape[0]
 
         # Step 4: Concatenate all retrieved memories and their positional embeddings
-        combined_memory = torch.cat(memories_to_concatenate, dim=0)
+        combined_memory = torch.cat(memories_to_concatenate, dim=0).to(dtype=inference_session.dtype)
         combined_memory_positional_embeddings = torch.cat(memory_positional_embeddings_to_concatenate, dim=0)
 
         # Step 5: Forward through the memory attention mechanism
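The added `.to(dtype=inference_session.dtype)` guards against cached memory tensors living in a different precision than the session computes in; without the cast, the downstream memory-attention matmul would hit a dtype mismatch. Minimal illustration (shapes and dtypes arbitrary):

```python
import torch

session_dtype = torch.float16                      # dtype the session computes in
memories = [torch.zeros(4, 8), torch.zeros(2, 8)]  # cached earlier in float32
combined = torch.cat(memories, dim=0).to(dtype=session_dtype)
assert combined.dtype == session_dtype             # safe to feed memory attention
```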
@@ -17,7 +17,7 @@ import torch
 
 from ...configuration_utils import PreTrainedConfig
 from ...processing_utils import Unpack
-from ...utils
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
 from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
 from ..sam2_video.configuration_sam2_video import Sam2VideoMaskDecoderConfig, Sam2VideoPromptEncoderConfig
 from ..sam2_video.modeling_sam2_video import (

@@ -544,33 +544,19 @@ class Sam3TrackerVideoModel(Sam2VideoModel):
 
         self.post_init()
 
+    @can_return_tuple
+    @auto_docstring
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[
-        list[torch.Tensor],
-        list[torch.Tensor],
-        tuple[torch.FloatTensor, ...] | None,
-        tuple[torch.FloatTensor, ...] | None,
-    ]:
+    ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
         r"""
-
-
-        Args:
-            pixel_values (`torch.FloatTensor`):
-                Input pixel values of shape `(batch_size, num_channels, height, width)`.
-
-        Returns:
-            `tuple`: A tuple containing:
-                - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
-                - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
-                - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
-                - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
+        pixel_values (`torch.FloatTensor`):
+            Input pixel values of shape `(batch_size, num_channels, height, width)`.
         """
         vision_outputs: Sam3TrackerVideoVisionEncoderOutput = self.vision_encoder(
-            pixel_values,
-            **kwargs,
+            pixel_values, return_dict=True, **kwargs
         )

@@ -588,8 +574,10 @@ class Sam3TrackerVideoModel(Sam2VideoModel):
             feature_map_position_embedding.flatten(2).permute(2, 0, 1)
             for feature_map_position_embedding in feature_maps_position_embeddings[:-1]
         ]
+        vision_outputs.fpn_hidden_states = feature_maps
+        vision_outputs.fpn_position_encoding = feature_maps_position_embeddings
 
-        return
+        return vision_outputs
 
 
 __all__ = [

@@ -14,6 +14,7 @@
 
 
 from collections import OrderedDict, defaultdict
+from collections.abc import Iterator
 from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any

@@ -54,7 +55,7 @@ def _load_cv_utils_kernel_once():
         return
 
     try:
-        cv_utils_kernel = get_kernel("kernels-community/
+        cv_utils_kernel = get_kernel("kernels-community/cv-utils")
     except Exception as e:
         logger.warning_once(
             f"Failed to load cv_utils kernel (your torch/cuda setup may not be supported): {e}. "

@@ -590,7 +591,8 @@ class Sam3VideoModel(Sam3VideoPreTrainedModel):
             text_embeds = self.detector_model.get_text_features(
                 input_ids=inference_session.prompt_input_ids[prompt_id],
                 attention_mask=inference_session.prompt_attention_masks[prompt_id],
-
+                return_dict=True,
+            ).pooler_output
             inference_session.prompt_embeddings[prompt_id] = text_embeds
         else:
             text_embeds = inference_session.prompt_embeddings[prompt_id]

@@ -1780,20 +1782,31 @@
         return processing_order, end_frame_idx
 
     @torch.inference_mode()
+    @auto_docstring(
+        custom_intro="""
+        Propagate the prompts to get grounding results for the entire video. Used when initializing an inference session with a whole video.
+        Yields Sam3VideoSegmentationOutput for each frame.
+        """
+    )
     def propagate_in_video_iterator(
         self,
         inference_session: Sam3VideoInferenceSession,
-        start_frame_idx=0,
-        max_frame_num_to_track=None,
-        reverse=False,
-
-
-
-
-
-
-
-
+        start_frame_idx: int = 0,
+        max_frame_num_to_track: int | None = None,
+        reverse: bool = False,
+        show_progress_bar: bool = False,
+    ) -> Iterator[Sam3VideoSegmentationOutput]:
+        r"""
+        inference_session (`Sam3VideoInferenceSession`):
+            The video inference session object.
+        start_frame_idx (`int`, *optional*, defaults to `0`):
+            The starting frame index for propagation.
+        max_frame_num_to_track (`int`, *optional*):
+            The maximum number of frames to track. If not provided, all frames in the video will be tracked.
+        reverse (`bool`, *optional*, defaults to `False`):
+            Whether to propagate in reverse.
+        show_progress_bar (`bool`, *optional*, defaults to `False`):
+            Whether to show a progress bar during propagation.
         """
         processing_order, end_frame_idx = self._get_processing_order(
             inference_session,

@@ -1803,7 +1816,7 @@
         )
 
         hotstart_buffer = []
-        for frame_idx in tqdm(processing_order):
+        for frame_idx in tqdm(processing_order, desc="propagate in video", disable=not show_progress_bar):
            out = self(inference_session=inference_session, frame_idx=frame_idx, reverse=reverse)
 
            if self.hotstart_delay > 0:
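Progress reporting during propagation is now opt-in: the tqdm bar stays disabled unless `show_progress_bar=True`. The pattern in isolation (requires the tqdm package; the loop body is a stand-in for the per-frame segmentation call):

```python
from tqdm import tqdm


def propagate(frames, show_progress_bar: bool = False):
    # disable=not show_progress_bar keeps the default path silent
    for frame_idx in tqdm(frames, desc="propagate in video", disable=not show_progress_bar):
        yield frame_idx  # stand-in for the per-frame segmentation output


list(propagate(range(5)))                          # no bar printed
list(propagate(range(5), show_progress_bar=True))  # bar printed to stderr
```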
@@ -290,6 +290,7 @@ class SamHQConfig(PreTrainedConfig):
         prompt_encoder_config=None,
         mask_decoder_config=None,
         initializer_range=0.02,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         vision_config = vision_config if vision_config is not None else {}

@@ -307,6 +308,7 @@ class SamHQConfig(PreTrainedConfig):
         self.prompt_encoder_config = SamHQPromptEncoderConfig(**prompt_encoder_config)
         self.mask_decoder_config = SamHQMaskDecoderConfig(**mask_decoder_config)
         self.initializer_range = initializer_range
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
 
 

@@ -415,7 +415,7 @@ class SamHQPositionalEmbedding(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.scale = config.scale
-        self.
+        self.positional_embedding = nn.Parameter(self.scale * torch.randn((2, config.num_pos_feats)))
 
     def forward(self, input_coords, input_shape=None):
         """Positionally encode points that are normalized to [0,1]."""
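The added line registers the random Gaussian projection matrix as an `nn.Parameter` so it is persisted with (and loadable from) the checkpoint. A standalone sketch of how such a matrix encodes a normalized point, assuming the SAM-style random-Fourier scheme (the `forward` details are an assumption, not copied from the file):

```python
import torch
from torch import nn

num_pos_feats, scale = 64, 1.0
positional_embedding = nn.Parameter(scale * torch.randn((2, num_pos_feats)))

coords = torch.tensor([[0.25, 0.75]])                # (x, y) normalized to [0, 1]
projected = (2 * coords - 1) @ positional_embedding  # shift to [-1, 1], then project
encoding = torch.cat(
    [torch.sin(2 * torch.pi * projected), torch.cos(2 * torch.pi * projected)], dim=-1
)
print(encoding.shape)  # torch.Size([1, 128])
```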
@@ -685,9 +685,9 @@ class SamHQAttention(nn.Module):
         value = self._separate_heads(value, self.num_attention_heads)
 
         # SamHQAttention
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,

@@ -1233,7 +1233,9 @@ class SamHQPromptEncoder(nn.Module):
 class SamHQModel(SamHQPreTrainedModel):
     input_modalities = ("image", "text")
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(SamHQTwoWayAttentionBlock, index=2)}
-
+    _tied_weights_keys = {
+        "prompt_encoder.shared_embedding.positional_embedding": "shared_image_embedding.positional_embedding"
+    }
 
     def __init__(self, config):
         super().__init__(config)
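`_tied_weights_keys` now declares the prompt encoder's positional embedding as tied to `shared_image_embedding`, replacing the `_keys_to_ignore_on_load_missing` workaround removed further down. What tying means, sketched with plain torch modules rather than the transformers loading machinery (module names illustrative):

```python
from torch import nn


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.shared_image_embedding = nn.Linear(2, 64, bias=False)
        self.prompt_encoder_embedding = nn.Linear(2, 64, bias=False)
        # Tie: both attribute paths now reference the same tensor, so a
        # checkpoint only needs to store one of them.
        self.prompt_encoder_embedding.weight = self.shared_image_embedding.weight


model = ToyModel()
assert model.prompt_encoder_embedding.weight is model.shared_image_embedding.weight
```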
@@ -1393,16 +1395,18 @@ class SamHQModel(SamHQPreTrainedModel):
 
         ```python
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoModel, AutoProcessor
 
         >>> model = AutoModel.from_pretrained("sushmanth/sam_hq_vit_b")
         >>> processor = AutoProcessor.from_pretrained("sushmanth/sam_hq_vit_b")
 
-        >>>
-        >>>
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read())).convert("RGB")
         >>> input_points = [[[400, 650]]]  # 2D location of a window on the car
-        >>> inputs = processor(images=
+        >>> inputs = processor(images=image, input_points=input_points, return_tensors="pt")
 
         >>> # Get high-quality segmentation mask
         >>> outputs = model(**inputs)

@@ -440,8 +440,6 @@ class SamHQVisionModel(SamVisionModel):
     """
 )
 class SamHQModel(SamModel):
-    _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"]
-
     def __init__(self, config):
         super().__init__(config)
         self.vision_encoder = SamHQVisionEncoder(config.vision_config)

@@ -546,16 +544,18 @@ class SamHQModel(SamModel):
 
         ```python
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoModel, AutoProcessor
 
         >>> model = AutoModel.from_pretrained("sushmanth/sam_hq_vit_b")
         >>> processor = AutoProcessor.from_pretrained("sushmanth/sam_hq_vit_b")
 
-        >>>
-        >>>
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read())).convert("RGB")
         >>> input_points = [[[400, 650]]]  # 2D location of a window on the car
-        >>> inputs = processor(images=
+        >>> inputs = processor(images=image, input_points=input_points, return_tensors="pt")
 
         >>> # Get high-quality segmentation mask
         >>> outputs = model(**inputs)

@@ -16,19 +16,20 @@ Processor class for SAMHQ.
 """
 
 from copy import deepcopy
+from typing import Union
 
 import numpy as np
 
+from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput
 from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
-from ...tokenization_utils_base import BatchEncoding
 from ...utils import auto_docstring, is_torch_available
 
 
 if is_torch_available():
     import torch
 
-NestedList = list[float | int | None
+NestedList = list[Union[float | int | None, "NestedList"]]
 
 
 class SamHQImagesKwargs(ImagesKwargs, total=False):
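The `NestedList` alias is now properly recursive via a `Union` carrying a string forward reference, so arbitrarily nested point/box lists type-check. The same idea standalone (the example value is illustrative):

```python
from typing import Union

# the forward reference "NestedList" is what makes the alias recursive
NestedList = list[Union[float, int, None, "NestedList"]]

input_points: NestedList = [[[400.0, 650.0]]]  # batch -> object -> (x, y)
print(input_points)
```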
@@ -61,9 +62,9 @@ class SamHQImagesKwargs(ImagesKwargs, total=False):
     """
 
     segmentation_maps: ImageInput | None
-    input_points: NestedList | None
-    input_labels: NestedList | None
-    input_boxes: NestedList | None
+    input_points: "NestedList | torch.Tensor | None"
+    input_labels: "NestedList | int | torch.Tensor | None"
+    input_boxes: "NestedList | torch.Tensor | None"
     point_pad_value: int | None
     mask_size: dict[str, int]
     mask_pad_size: dict[str, int]

@@ -94,7 +95,7 @@ class SamHQProcessor(ProcessorMixin):
         self,
         images: ImageInput | None = None,
         **kwargs: Unpack[SamHQProcessorKwargs],
-    ) ->
+    ) -> BatchFeature:
         output_kwargs = self._merge_kwargs(
             SamHQProcessorKwargs,
             tokenizer_init_kwargs={},

@@ -316,6 +316,7 @@ class SeamlessM4TConfig(PreTrainedConfig):
         variance_predictor_kernel_size=3,
         var_pred_dropout=0.5,
         vocoder_offset=4,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         # overall_config

@@ -334,6 +335,7 @@ class SeamlessM4TConfig(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.activation_dropout = activation_dropout
         self.scale_embedding = scale_embedding
+        self.tie_word_embeddings = tie_word_embeddings
         # for proper config init
         self.num_attention_heads = decoder_attention_heads
         self.num_hidden_layers = decoder_layers

@@ -400,16 +402,13 @@ class SeamlessM4TConfig(PreTrainedConfig):
         self.variance_predictor_kernel_size = variance_predictor_kernel_size
         self.var_pred_dropout = var_pred_dropout
         self.vocoder_offset = vocoder_offset
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.max_position_embeddings = max_position_embeddings
+        self.decoder_start_token_id = decoder_start_token_id
 
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            max_position_embeddings=max_position_embeddings,
-            **kwargs,
-        )
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
 
 __all__ = ["SeamlessM4TConfig"]
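Both SeamlessM4T configs (and SeedOssConfig below) move the special-token ids from `super().__init__(...)` keywords to plain instance attributes, forwarding only `is_encoder_decoder` and the remaining kwargs to the base class. The shape of the change reduced to a toy config (names illustrative, not the transformers classes):

```python
class BaseConfig:
    def __init__(self, is_encoder_decoder: bool = False, **kwargs):
        self.is_encoder_decoder = is_encoder_decoder


class ToyConfig(BaseConfig):
    def __init__(self, pad_token_id=0, bos_token_id=2, eos_token_id=3, is_encoder_decoder=True, **kwargs):
        # token ids are now set directly instead of being routed through super()
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)


cfg = ToyConfig()
assert (cfg.pad_token_id, cfg.is_encoder_decoder) == (0, True)
```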
@@ -322,6 +322,7 @@ class SeamlessM4Tv2Config(PreTrainedConfig):
         variance_predictor_kernel_size=3,
         var_pred_dropout=0.5,
         vocoder_offset=4,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         # overall_config

@@ -341,6 +342,7 @@ class SeamlessM4Tv2Config(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.activation_dropout = activation_dropout
         self.scale_embedding = scale_embedding
+        self.tie_word_embeddings = tie_word_embeddings
         # for proper config init
         self.num_attention_heads = decoder_attention_heads
         self.num_hidden_layers = decoder_layers

@@ -409,16 +411,13 @@ class SeamlessM4Tv2Config(PreTrainedConfig):
         self.variance_predictor_kernel_size = variance_predictor_kernel_size
         self.var_pred_dropout = var_pred_dropout
         self.vocoder_offset = vocoder_offset
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.max_position_embeddings = max_position_embeddings
 
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            max_position_embeddings=max_position_embeddings,
-            **kwargs,
-        )
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
 
 __all__ = ["SeamlessM4Tv2Config"]

@@ -13,8 +13,8 @@
 # limitations under the License.
 """SeedOss model configuration"""
 
-from
-from
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
 
 
 class SeedOssConfig(PreTrainedConfig):

@@ -170,13 +170,11 @@ class SeedOssConfig(PreTrainedConfig):
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
         self.rope_parameters = rope_parameters
 
-
-
-
-
-
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)
 
 
 __all__ = ["SeedOssConfig"]

@@ -206,9 +206,9 @@ class SeedOssAttention(nn.Module):
         cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
         key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,

@@ -440,7 +440,7 @@ class SeedOssModel(SeedOssPreTrainedModel):
 @auto_docstring
 class SeedOssForCausalLM(SeedOssPreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
     def __init__(self, config):
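The new `_tp_plan` value `colwise_gather_output` names a column-parallel shard of `lm_head` whose per-rank logit slices are gathered back into the full vocabulary dimension. The math without the distributed runtime, simulating two ranks (tensor shapes arbitrary):

```python
import torch

hidden = torch.randn(1, 16)
full_weight = torch.randn(32, 16)        # lm_head weight, [vocab, hidden]
shards = full_weight.chunk(2, dim=0)     # column-parallel: split the output dim across ranks

per_rank_logits = [hidden @ shard.T for shard in shards]  # each rank computes its vocab slice
gathered = torch.cat(per_rank_logits, dim=-1)             # all-gather along the vocab dim
assert torch.allclose(gathered, hidden @ full_weight.T, atol=1e-5)
```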
@@ -118,9 +118,9 @@ class SeedOssAttention(nn.Module):
         cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
         key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,