PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/segformer/image_processing_segformer_fast.py CHANGED Viewed

@@ -21,7 +21,7 @@
 from typing import Optional, Union
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
@@ -66,7 +66,7 @@ class SegformerImageProcessorFast(BaseImageProcessorFast):
             label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype), label)
             labels[idx] = label
-        return label
+        return labels
     @auto_docstring
     def preprocess(
@@ -114,7 +114,7 @@ class SegformerImageProcessorFast(BaseImageProcessorFast):
                     "do_normalize": False,
                     "do_rescale": False,
                     # Nearest interpolation is used for segmentation maps instead of BILINEAR.
-                    "interpolation": F.InterpolationMode.NEAREST_EXACT,
+                    "interpolation": tvF.InterpolationMode.NEAREST_EXACT,
                 }
             )
             processed_segmentation_maps = self._preprocess(
@@ -128,7 +128,7 @@ class SegformerImageProcessorFast(BaseImageProcessorFast):
         self,
         images: list["torch.Tensor"],
         do_reduce_labels: bool,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_resize: bool,
         do_rescale: bool,
         do_normalize: bool,

transformers/models/segformer/modeling_segformer.py CHANGED Viewed

@@ -639,13 +639,15 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
         ```python
         >>> from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
         >>> model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> inputs = image_processor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)

transformers/models/segformer/modular_segformer.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from typing import Optional, Union
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 from transformers.models.beit.image_processing_beit_fast import BeitImageProcessorFast
@@ -86,7 +86,7 @@ class SegformerImageProcessorFast(BeitImageProcessorFast):
                     "do_normalize": False,
                     "do_rescale": False,
                     # Nearest interpolation is used for segmentation maps instead of BILINEAR.
-                    "interpolation": F.InterpolationMode.NEAREST_EXACT,
+                    "interpolation": tvF.InterpolationMode.NEAREST_EXACT,
                 }
             )
             processed_segmentation_maps = self._preprocess(
@@ -100,7 +100,7 @@ class SegformerImageProcessorFast(BeitImageProcessorFast):
         self,
         images: list["torch.Tensor"],
         do_reduce_labels: bool,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_resize: bool,
         do_rescale: bool,
         do_normalize: bool,

transformers/models/seggpt/modeling_seggpt.py CHANGED Viewed

@@ -671,15 +671,21 @@ class SegGptModel(SegGptPreTrainedModel):
         ```python
         >>> from transformers import SegGptImageProcessor, SegGptModel
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
         >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
         >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"
-        >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
-        >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
-        >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")
+        >>> with httpx.stream("GET", image_input_url) as response:
+        ...     image_input = Image.open(BytesIO(response.read()))
+        >>> with httpx.stream("GET", image_prompt_url) as response:
+        ...     image_prompt = Image.open(BytesIO(response.read()))
+        >>> with httpx.stream("GET", mask_prompt_url) as response:
+        ...     mask_prompt = Image.open(BytesIO(response.read())).convert("L")
         >>> checkpoint = "BAAI/seggpt-vit-large"
         >>> model = SegGptModel.from_pretrained(checkpoint)
@@ -868,15 +874,21 @@ class SegGptForImageSegmentation(SegGptPreTrainedModel):
         ```python
         >>> from transformers import SegGptImageProcessor, SegGptForImageSegmentation
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
         >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
         >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"
-        >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
-        >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
-        >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")
+        >>> with httpx.stream("GET", image_input_url) as response:
+        ...     image_input = Image.open(BytesIO(response.read()))
+        >>> with httpx.stream("GET", image_prompt_url) as response:
+        ...     image_prompt = Image.open(BytesIO(response.read()))
+        >>> with httpx.stream("GET", mask_prompt_url) as response:
+        ...     mask_prompt = Image.open(BytesIO(response.read())).convert("L")
         >>> checkpoint = "BAAI/seggpt-vit-large"
         >>> model = SegGptForImageSegmentation.from_pretrained(checkpoint)

transformers/models/sew/configuration_sew.py CHANGED Viewed

@@ -192,7 +192,10 @@ class SEWConfig(PreTrainedConfig):
         eos_token_id=2,
         **kwargs,
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.feat_extract_norm = feat_extract_norm
         self.feat_extract_activation = feat_extract_activation

transformers/models/sew/modeling_sew.py CHANGED Viewed

@@ -33,9 +33,10 @@ from ...integrations.fsdp import is_fsdp_managed_module
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel, get_torch_context_manager_or_global_device
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils.generic import is_flash_attention_requested
 from .configuration_sew import SEWConfig
@@ -321,9 +322,9 @@ class SEWAttention(nn.Module):
         key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
         value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -428,7 +429,7 @@ class SEWEncoder(nn.Module):
         if attention_mask is not None:
             expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
-            if self.config._attn_implementation == "flash_attention_2":
+            if is_flash_attention_requested(self.config):
                 # make sure padded tokens output 0
                 hidden_states[~expand_attention_mask] = 0.0
                 # 2d mask is passed through the layers
@@ -864,6 +865,9 @@ class SEWForCTC(SEWPreTrainedModel):
         This method is **not** supposed to be called by the user and is prone to be changed in the future.
         """
+        if get_torch_context_manager_or_global_device() == torch.device("meta"):
+            return
         # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
         # correctly load adapter layers for SEW so that we do not have to introduce a new API to
         # [`PreTrainedModel`]. While slightly hacky, SEW never has to tie input and output embeddings, so that it is

transformers/models/sew/modular_sew.py CHANGED Viewed

@@ -25,6 +25,7 @@ from ...integrations.fsdp import is_fsdp_managed_module
 from ...modeling_outputs import BaseModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring
+from ...utils.generic import is_flash_attention_requested
 from ..wav2vec2.modeling_wav2vec2 import (
     Wav2Vec2Attention,
     Wav2Vec2EncoderLayer,
@@ -166,7 +167,7 @@ class SEWEncoder(nn.Module):
         if attention_mask is not None:
             expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
-            if self.config._attn_implementation == "flash_attention_2":
+            if is_flash_attention_requested(self.config):
                 # make sure padded tokens output 0
                 hidden_states[~expand_attention_mask] = 0.0
                 # 2d mask is passed through the layers

transformers/models/sew_d/configuration_sew_d.py CHANGED Viewed

@@ -213,7 +213,10 @@ class SEWDConfig(PreTrainedConfig):
         eos_token_id=2,
         **kwargs,
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.feat_extract_norm = feat_extract_norm
         self.feat_extract_activation = feat_extract_activation

transformers/models/sew_d/modeling_sew_d.py CHANGED Viewed

@@ -26,7 +26,7 @@ from ...activations import ACT2FN
 from ...integrations.deepspeed import is_deepspeed_zero3_enabled
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
-from ...modeling_utils import PreTrainedModel
+from ...modeling_utils import PreTrainedModel, get_torch_context_manager_or_global_device
 from ...pytorch_utils import softmax_backward_data
 from ...utils import auto_docstring, logging
 from .configuration_sew_d import SEWDConfig
@@ -1407,6 +1407,9 @@ class SEWDForCTC(SEWDPreTrainedModel):
         This method is **not** supposed to be called by the user and is prone to be changed in the future.
         """
+        if get_torch_context_manager_or_global_device() == torch.device("meta"):
+            return
         # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
         # correctly load adapter layers for SEWD so that we do not have to introduce a new API to
         # [`PreTrainedModel`]. While slightly hacky, SEWD never has to tie input and output embeddings, so that it is

transformers/models/shieldgemma2/modeling_shieldgemma2.py CHANGED Viewed

@@ -58,16 +58,16 @@ class ShieldGemma2ForImageClassification(PreTrainedModel):
         self.post_init()
     def get_input_embeddings(self):
-        return self.model.language_model.get_input_embeddings()
+        return self.model.get_decoder().get_input_embeddings()
     def set_input_embeddings(self, value):
-        self.model.language_model.set_input_embeddings(value)
+        self.model.get_decoder().set_input_embeddings(value)
     def get_output_embeddings(self):
-        return self.model.language_model.get_output_embeddings()
+        return self.model.get_decoder().get_output_embeddings()
     def set_output_embeddings(self, new_embeddings):
-        self.model.language_model.set_output_embeddings(new_embeddings)
+        self.model.get_decoder().set_output_embeddings(new_embeddings)
     @auto_docstring
     def forward(

transformers/models/siglip/configuration_siglip.py CHANGED Viewed

@@ -98,7 +98,10 @@ class SiglipTextConfig(PreTrainedConfig):
         projection_size=None,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size

transformers/models/siglip/modeling_siglip.py CHANGED Viewed

@@ -13,7 +13,6 @@
 # limitations under the License.
 """PyTorch Siglip model."""
-import math
 from collections.abc import Callable
 from dataclasses import dataclass
 from typing import Any
@@ -21,7 +20,6 @@ from typing import Any
 import numpy as np
 import torch
 from torch import nn
-from torch.nn.init import _calculate_fan_in_and_fan_out
 from ... import initialization as init
 from ...activations import ACT2FN
@@ -35,43 +33,12 @@ from ...utils import (
     TransformersKwargs,
     auto_docstring,
     can_return_tuple,
-    filter_out_non_signature_kwargs,
     torch_int,
 )
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, is_flash_attention_requested
 from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
-def variance_scaling_(tensor, mode="fan_in", distribution="normal"):
-    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
-    if mode == "fan_in":
-        denom = fan_in
-    elif mode == "fan_out":
-        denom = fan_out
-    elif mode == "fan_avg":
-        denom = (fan_in + fan_out) / 2
-    variance = 1.0 / denom
-    if distribution == "truncated_normal":
-        init.trunc_normal_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
-    elif distribution == "normal":
-        init.normal_(tensor, std=math.sqrt(variance))
-    elif distribution == "uniform":
-        bound = math.sqrt(3 * variance)
-        init.uniform_(tensor, -bound, bound)
-    else:
-        raise ValueError(f"invalid distribution {distribution}")
-def lecun_normal_(tensor):
-    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
-def default_flax_embed_init(tensor):
-    variance_scaling_(tensor, mode="fan_in", distribution="normal")
 @dataclass
 @auto_docstring(
     custom_intro="""
@@ -325,9 +292,9 @@ class SiglipAttention(nn.Module):
         keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
         values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -432,7 +399,7 @@ class SiglipPreTrainedModel(PreTrainedModel):
             if hasattr(module, "position_ids"):
                 init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, nn.Embedding):
-            default_flax_embed_init(module.weight)
+            init.default_flax_embed_init_(module.weight)
         elif isinstance(module, SiglipAttention):
             init.xavier_uniform_(module.q_proj.weight)
             init.xavier_uniform_(module.k_proj.weight)
@@ -460,7 +427,7 @@ class SiglipPreTrainedModel(PreTrainedModel):
                 std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
             )
         elif isinstance(module, (nn.Linear, nn.Conv2d)):
-            lecun_normal_(module.weight)
+            init.lecun_normal_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
         elif isinstance(module, nn.LayerNorm):
@@ -538,7 +505,7 @@ class SiglipTextTransformer(SiglipPreTrainedModel):
         # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
         # expand attention_mask
-        uses_flash_attention = "flash" in self.config._attn_implementation
+        uses_flash_attention = is_flash_attention_requested(self.config)
         if uses_flash_attention:
             attention_mask = None
         elif attention_mask is not None and not uses_flash_attention:
@@ -621,10 +588,6 @@ class SiglipTextModel(SiglipPreTrainedModel):
 class SiglipVisionTransformer(SiglipPreTrainedModel):
     _input_embed_layer = "patch_embedding"
-    _can_record_outputs = {
-        "hidden_states": SiglipEncoderLayer,
-        "attentions": SiglipAttention,
-    }
     def __init__(self, config: SiglipVisionConfig):
         super().__init__(config)
@@ -640,7 +603,6 @@ class SiglipVisionTransformer(SiglipPreTrainedModel):
         self.post_init()
-    @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
     def forward(
         self,
@@ -724,14 +686,16 @@ class SiglipVisionModel(SiglipPreTrainedModel):
         ```python
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoProcessor, SiglipVisionModel
         >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
         >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> inputs = processor(images=image, return_tensors="pt")
@@ -789,19 +753,16 @@ class SiglipModel(SiglipPreTrainedModel):
     def set_input_embeddings(self, value: nn.Module):
         self.text_model.embeddings.token_embedding = value
-    @filter_out_non_signature_kwargs()
+    @can_return_tuple
     @auto_docstring
     def get_text_features(
         self,
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
-    ) -> torch.FloatTensor:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
         r"""
-        Returns:
-            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
-            applying the projection layer to the pooled output of [`SiglipTextModel`].
         Examples:
         ```python
@@ -816,28 +777,22 @@ class SiglipModel(SiglipPreTrainedModel):
         >>> with torch.no_grad():
         ...     text_features = model.get_text_features(**inputs)
         ```"""
-        text_outputs: BaseModelOutputWithPooling = self.text_model(
+        return self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
+            **kwargs,
         )
-        pooled_output = text_outputs.pooler_output
-        return pooled_output
-    @filter_out_non_signature_kwargs()
+    @can_return_tuple
     @auto_docstring
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> torch.FloatTensor:
+    ) -> tuple | BaseModelOutputWithPooling:
         r"""
-        Returns:
-            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
-            applying the projection layer to the pooled output of [`SiglipVisionModel`].
         Examples:
         ```python
@@ -856,14 +811,11 @@ class SiglipModel(SiglipPreTrainedModel):
         >>> with torch.no_grad():
         ...     image_features = model.get_image_features(**inputs)
         ```"""
-        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
+        return self.vision_model(
             pixel_values=pixel_values,
             interpolate_pos_encoding=interpolate_pos_encoding,
             **kwargs,
         )
-        pooled_output = vision_outputs.pooler_output
-        return pooled_output
     # NOTE: SiglipModel uses Pretrained backbones, so we don't need to add `check_model_inputs` here
     @can_return_tuple
@@ -886,7 +838,8 @@ class SiglipModel(SiglipPreTrainedModel):
         ```python
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoProcessor, AutoModel
         >>> import torch
@@ -894,7 +847,8 @@ class SiglipModel(SiglipPreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
         >>> # important: we pass `padding=max_length` since the model was trained with this
@@ -1011,11 +965,13 @@ class SiglipForImageClassification(SiglipPreTrainedModel):
         >>> from transformers import AutoImageProcessor, SiglipForImageClassification
         >>> import torch
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> # note: we are loading a `SiglipModel` from the hub here,
         >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.

transformers/models/siglip2/__init__.py CHANGED Viewed

@@ -23,6 +23,7 @@ if TYPE_CHECKING:
     from .image_processing_siglip2_fast import *
     from .modeling_siglip2 import *
     from .processing_siglip2 import *
+    from .tokenization_siglip2 import *
 else:
     import sys

transformers/models/siglip2/configuration_siglip2.py CHANGED Viewed

@@ -18,7 +18,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
@@ -104,7 +103,10 @@ class Siglip2TextConfig(PreTrainedConfig):
         projection_size=None,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size

transformers/models/siglip2/image_processing_siglip2_fast.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from typing import Optional
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict
@@ -94,7 +94,7 @@ class Siglip2ImageProcessorFast(BaseImageProcessorFast):
         do_resize: bool,
         patch_size: int,
         max_num_patches: int,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_rescale: bool,
         rescale_factor: float,
         do_normalize: bool,

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl