PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py CHANGED Viewed

@@ -18,13 +18,9 @@
 # limitations under the License.
+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
-from ...utils import logging
-from ...utils.backbone_utils import verify_backbone_config_arguments
-from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig
-logger = logging.get_logger(__name__)
+from ..auto.configuration_auto import AutoConfig
 class PromptDepthAnythingConfig(PreTrainedConfig):
@@ -40,18 +36,6 @@ class PromptDepthAnythingConfig(PreTrainedConfig):
     Args:
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Dinov2Config()`):
             The configuration of the backbone model.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, defaults to `False`):
-            Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
-            API.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         patch_size (`int`, *optional*, defaults to 14):
             The size of the patches to extract from the backbone features.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -95,10 +79,6 @@ class PromptDepthAnythingConfig(PreTrainedConfig):
     def __init__(
         self,
         backbone_config=None,
-        backbone=None,
-        use_pretrained_backbone=False,
-        use_timm_backbone=False,
-        backbone_kwargs=None,
         patch_size=14,
         initializer_range=0.02,
         reassemble_hidden_size=384,
@@ -111,34 +91,20 @@ class PromptDepthAnythingConfig(PreTrainedConfig):
         max_depth=None,
         **kwargs,
     ):
-        if backbone_config is None and backbone is None:
-            logger.info("`backbone_config` is `None`. Initializing the config with the default `Dinov2` backbone.")
-            backbone_config = CONFIG_MAPPING["dinov2"](
-                image_size=518,
-                hidden_size=384,
-                num_attention_heads=6,
-                out_indices=[9, 10, 11, 12],
-                apply_layernorm=True,
-                reshape_hidden_states=False,
-            )
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.get("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
+            default_config_type="dinov2",
+            default_config_kwargs={
+                "image_size": 518,
+                "hidden_size": 384,
+                "num_attention_heads": 6,
+                "out_indices": [9, 10, 11, 12],
+                "reshape_hidden_states": False,
+            },
+            **kwargs,
         )
         self.backbone_config = backbone_config
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs
         self.reassemble_hidden_size = reassemble_hidden_size
         self.patch_size = patch_size
         self.initializer_range = initializer_range

transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py CHANGED Viewed

@@ -23,7 +23,7 @@ from ...processing_utils import Unpack
 if TYPE_CHECKING:
     from ...modeling_outputs import DepthEstimatorOutput
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 from ...image_processing_utils_fast import (
     BaseImageProcessorFast,
@@ -128,14 +128,14 @@ class PromptDepthAnythingImageProcessorFast(BaseImageProcessorFast):
         size: SizeDict,
         keep_aspect_ratio: bool = False,
         ensure_multiple_of: int = 1,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
     ) -> "torch.Tensor":
         """
         Resize an image to target size while optionally maintaining aspect ratio and ensuring dimensions are multiples.
         """
         # Set default interpolation to BICUBIC to match the slow processor (causes slight numerical differences otherwise)
         if interpolation is None:
-            interpolation = F.InterpolationMode.BICUBIC
+            interpolation = tvF.InterpolationMode.BICUBIC
         # Custom resize with aspect ratio preservation and ensure_multiple_of constraint
         output_size = _get_resize_output_image_size(
@@ -175,11 +175,11 @@ class PromptDepthAnythingImageProcessorFast(BaseImageProcessorFast):
         pad_size_top, pad_size_bottom = _get_pad(height, size_divisor)
         # Use torchvision padding for fast processing
-        # /!\ NB: torchvision F.pad expects (left, top, right, bottom) for the last two dims (W then H)
+        # /!\ NB: torchvision tvF.pad expects (left, top, right, bottom) for the last two dims (W then H)
         # Source: https://docs.pytorch.org/vision/main/generated/torchvision.transforms.Pad.html
         # So: (left=width_pad, top=height_pad, right=width_pad, bottom=height_pad)
         padding = [pad_size_left, pad_size_top, pad_size_right, pad_size_bottom]
-        padded_image = F.pad(image, padding=padding)
+        padded_image = tvF.pad(image, padding=padding)
         return padded_image
@@ -249,7 +249,7 @@ class PromptDepthAnythingImageProcessorFast(BaseImageProcessorFast):
         do_resize: bool,
         size: SizeDict,
         keep_aspect_ratio: bool | None,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_rescale: bool,
         rescale_factor: float,
         do_normalize: bool,

transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py CHANGED Viewed

@@ -20,12 +20,11 @@
 import torch
 import torch.nn as nn
-from transformers.utils.generic import torch_int
+from ...backbone_utils import load_backbone
 from ...modeling_outputs import DepthEstimatorOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring
-from ...utils.backbone_utils import load_backbone
+from ...utils.generic import torch_int
 from .configuration_prompt_depth_anything import PromptDepthAnythingConfig
@@ -409,16 +408,19 @@ class PromptDepthAnythingForDepthEstimation(PromptDepthAnythingPreTrainedModel):
         >>> import torch
         >>> import numpy as np
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> url = "https://github.com/DepthAnything/PromptDA/blob/main/assets/example_images/image.jpg?raw=true"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> image_processor = AutoImageProcessor.from_pretrained("depth-anything/prompt-depth-anything-vits-hf")
         >>> model = AutoModelForDepthEstimation.from_pretrained("depth-anything/prompt-depth-anything-vits-hf")
         >>> prompt_depth_url = "https://github.com/DepthAnything/PromptDA/blob/main/assets/example_images/arkit_depth.png?raw=true"
-        >>> prompt_depth = Image.open(requests.get(prompt_depth_url, stream=True).raw)
+        >>> with httpx.stream("GET", prompt_depth_url) as response:
+        ...     prompt_depth = Image.open(BytesIO(response.read()))
         >>> # prepare image for the model
         >>> inputs = image_processor(images=image, return_tensors="pt", prompt_depth=prompt_depth)

transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py CHANGED Viewed

@@ -14,8 +14,12 @@
 import torch
 import torch.nn as nn
-from transformers.models.depth_anything.configuration_depth_anything import DepthAnythingConfig
-from transformers.models.depth_anything.modeling_depth_anything import (
+from ...modeling_outputs import DepthEstimatorOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring
+from ...utils.generic import torch_int
+from ..depth_anything.configuration_depth_anything import DepthAnythingConfig
+from ..depth_anything.modeling_depth_anything import (
     DepthAnythingDepthEstimationHead,
     DepthAnythingFeatureFusionLayer,
     DepthAnythingFeatureFusionStage,
@@ -23,11 +27,6 @@ from transformers.models.depth_anything.modeling_depth_anything import (
     DepthAnythingNeck,
     DepthAnythingReassembleStage,
 )
-from transformers.utils.generic import torch_int
-from ...modeling_outputs import DepthEstimatorOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring
 class PromptDepthAnythingConfig(DepthAnythingConfig):
@@ -252,16 +251,19 @@ class PromptDepthAnythingForDepthEstimation(DepthAnythingForDepthEstimation):
         >>> import torch
         >>> import numpy as np
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> url = "https://github.com/DepthAnything/PromptDA/blob/main/assets/example_images/image.jpg?raw=true"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> image_processor = AutoImageProcessor.from_pretrained("depth-anything/prompt-depth-anything-vits-hf")
         >>> model = AutoModelForDepthEstimation.from_pretrained("depth-anything/prompt-depth-anything-vits-hf")
         >>> prompt_depth_url = "https://github.com/DepthAnything/PromptDA/blob/main/assets/example_images/arkit_depth.png?raw=true"
-        >>> prompt_depth = Image.open(requests.get(prompt_depth_url, stream=True).raw)
+        >>> with httpx.stream("GET", prompt_depth_url) as response:
+        ...     prompt_depth = Image.open(BytesIO(response.read()))
         >>> # prepare image for the model
         >>> inputs = image_processor(images=image, return_tensors="pt", prompt_depth=prompt_depth)

transformers/models/prophetnet/configuration_prophetnet.py CHANGED Viewed

@@ -126,6 +126,8 @@ class ProphetNetConfig(PreTrainedConfig):
         pad_token_id: int | None = 0,
         bos_token_id: int | None = 1,
         eos_token_id: int | None = 2,
+        is_decoder: bool | None = False,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -153,16 +155,15 @@ class ProphetNetConfig(PreTrainedConfig):
         self.dropout = dropout
         self.use_cache = use_cache
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            add_cross_attention=add_cross_attention,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.add_cross_attention = add_cross_attention
+        self.decoder_start_token_id = decoder_start_token_id
+        self.is_decoder = is_decoder
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
     @property
     def num_hidden_layers(self) -> int:

transformers/models/prophetnet/modeling_prophetnet.py CHANGED Viewed

@@ -1830,29 +1830,18 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel, GenerationMixin):
         is_first_iteration=False,
         **kwargs,
     ):
-        # Overwritten -- our tests complain if we use GenerationMixin.prepare_inputs_for_generation
-        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
-        if attention_mask is None:
-            attention_mask = input_ids.new_ones(input_ids.shape)
-        if past_key_values is not None and not is_first_iteration:
-            input_ids = input_ids[:, -1:]
-        # first step, decoder_cached_states are empty
-        model_inputs = {
-            "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
-            "attention_mask": attention_mask,
-            "past_key_values": past_key_values,
-            "use_cache": use_cache,
-        }
-        # Prophetnet does not support cache_position
-        kwargs.pop("cache_position", None)
-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
+        # Overwritten -- Prophetnet does not support cache_position
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+        model_inputs.pop("cache_position", None)
         return model_inputs

transformers/models/pvt/image_processing_pvt.py CHANGED Viewed

@@ -47,7 +47,7 @@ class PvtImageProcessor(BaseImageProcessor):
         size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
             Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
             method.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
             Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
             `preprocess` method.
         do_rescale (`bool`, *optional*, defaults to `True`):
@@ -73,7 +73,7 @@ class PvtImageProcessor(BaseImageProcessor):
         self,
         do_resize: bool = True,
         size: dict[str, int] | None = None,
-        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         do_rescale: bool = True,
         rescale_factor: int | float = 1 / 255,
         do_normalize: bool = True,
@@ -93,12 +93,11 @@ class PvtImageProcessor(BaseImageProcessor):
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
-    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
         size: dict[str, int],
-        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         data_format: str | ChannelDimension | None = None,
         input_data_format: str | ChannelDimension | None = None,
         **kwargs,
@@ -111,8 +110,8 @@ class PvtImageProcessor(BaseImageProcessor):
                 Image to resize.
             size (`dict[str, int]`):
                 Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
-            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
-                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
             data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format for the output image. If unset, the channel dimension format of the input
                 image is used. Can be one of:
@@ -129,6 +128,7 @@ class PvtImageProcessor(BaseImageProcessor):
         Returns:
             `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
             raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
@@ -171,7 +171,7 @@ class PvtImageProcessor(BaseImageProcessor):
                 Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
                 resizing.
             resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
-                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has
                 an effect if `do_resize` is set to `True`.
             do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                 Whether to rescale the image values between [0 - 1].

transformers/models/pvt/image_processing_pvt_fast.py CHANGED Viewed

@@ -20,7 +20,7 @@ from ...utils import auto_docstring
 @auto_docstring
 class PvtImageProcessorFast(BaseImageProcessorFast):
-    resample = PILImageResampling.BILINEAR
+    resample = PILImageResampling.BICUBIC
     image_mean = IMAGENET_DEFAULT_MEAN
     image_std = IMAGENET_DEFAULT_STD
     size = {"height": 224, "width": 224}

transformers/models/pvt_v2/configuration_pvt_v2.py CHANGED Viewed

@@ -17,9 +17,9 @@
 from collections.abc import Callable
+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
 logger = logging.get_logger(__name__)
@@ -147,9 +147,7 @@ class PvtV2Config(BackboneConfigMixin, PreTrainedConfig):
         self.qkv_bias = qkv_bias
         self.linear_attention = linear_attention
         self.stage_names = [f"stage{idx}" for idx in range(1, len(depths) + 1)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)
 __all__ = ["PvtV2Config"]

transformers/models/pvt_v2/modeling_pvt_v2.py CHANGED Viewed

@@ -22,11 +22,11 @@ from torch import nn
 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BackboneOutput, BaseModelOutput, ImageClassifierOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring, logging
-from ...utils.backbone_utils import BackboneMixin
 from .configuration_pvt_v2 import PvtV2Config
@@ -510,10 +510,9 @@ class PvtV2ForImageClassification(PvtV2PreTrainedModel):
     PVTv2 backbone, to be used with frameworks like DETR and MaskFormer.
     """
 )
-class PvtV2Backbone(PvtV2Model, BackboneMixin):
+class PvtV2Backbone(BackboneMixin, PvtV2Model):
     def __init__(self, config: PvtV2Config):
         super().__init__(config)
-        super()._init_backbone(config)
         self.num_features = config.hidden_sizes
     @auto_docstring
@@ -532,10 +531,12 @@ class PvtV2Backbone(PvtV2Model, BackboneMixin):
         >>> from transformers import AutoImageProcessor, AutoBackbone
         >>> import torch
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> processor = AutoImageProcessor.from_pretrained("OpenGVLab/pvt_v2_b0")
         >>> model = AutoBackbone.from_pretrained(

transformers/models/qwen2/configuration_qwen2.py CHANGED Viewed

@@ -79,6 +79,12 @@ class Qwen2Config(PreTrainedConfig):
             Attention pattern for each layer.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*):
+            End of stream token id.
     ```python
     >>> from transformers import Qwen2Model, Qwen2Config
@@ -132,6 +138,9 @@ class Qwen2Config(PreTrainedConfig):
         max_window_layers: int | None = 28,
         layer_types: list[str] | None = None,
         attention_dropout: float | None = 0.0,
+        pad_token_id: int | None = None,
+        bos_token_id: int | None = None,
+        eos_token_id: int | None = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -165,12 +174,13 @@ class Qwen2Config(PreTrainedConfig):
             ]
         layer_type_validation(self.layer_types, self.num_hidden_layers)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.rope_parameters = rope_parameters
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 __all__ = ["Qwen2Config"]

transformers/models/qwen2/modeling_qwen2.py CHANGED Viewed

@@ -227,9 +227,9 @@ class Qwen2Attention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -428,7 +428,7 @@ class Qwen2Model(Qwen2PreTrainedModel):
 @auto_docstring
 class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):

transformers/models/qwen2/modular_qwen2.py CHANGED Viewed

@@ -82,9 +82,9 @@ class Qwen2Attention(LlamaAttention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/qwen2/tokenization_qwen2.py CHANGED Viewed

@@ -42,8 +42,6 @@ class Qwen2Tokenizer(TokenizersBackend):
         self,
         vocab: str | dict[str, int] | None = None,
         merges: str | list[str] | None = None,
-        vocab_file=None,
-        merges_file=None,
         unk_token: str = "<|endoftext|>",
         bos_token=None,
         eos_token: str = "<|endoftext|>",
@@ -89,8 +87,6 @@ class Qwen2Tokenizer(TokenizersBackend):
         )
         super().__init__(
-            vocab_file=vocab_file,
-            merges_file=merges_file,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,

transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py CHANGED Viewed

@@ -319,13 +319,15 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig):
         initializer_range: float | None = 0.02,
         rms_norm_eps: int | None = 1e-6,
         use_cache: bool | None = True,
-        tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         use_sliding_window: bool | None = False,
         sliding_window: int | None = 32768,
         max_window_layers: int | None = 28,
         layer_types: list[str] | None = None,
         attention_dropout: float | None = 0.0,
+        pad_token_id: int | None = None,
+        bos_token_id: int | None = None,
+        eos_token_id: int | None = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -337,6 +339,9 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig):
         self.use_sliding_window = use_sliding_window
         self.sliding_window = sliding_window if self.use_sliding_window else None
         self.max_window_layers = max_window_layers
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         # for backward compatibility
         if num_key_value_heads is None:
@@ -361,7 +366,6 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig):
         self.rope_parameters = rope_parameters
         super().__init__(
-            tie_word_embeddings=tie_word_embeddings,
             ignore_keys_at_rope_validation={"mrope_section"},
             **kwargs,
         )
@@ -403,6 +407,8 @@ class Qwen2_5OmniThinkerConfig(PreTrainedConfig):
             The user token index to encode the user token.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
     Example:
@@ -454,6 +460,7 @@ class Qwen2_5OmniThinkerConfig(PreTrainedConfig):
         audio_end_token_id=151648,
         user_token_id=872,
         initializer_range=0.02,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         self.audio_token_index = audio_token_index
@@ -465,6 +472,7 @@ class Qwen2_5OmniThinkerConfig(PreTrainedConfig):
         self.audio_start_token_id = audio_start_token_id
         self.audio_end_token_id = audio_end_token_id
         self.initializer_range = initializer_range
+        self.tie_word_embeddings = tie_word_embeddings
         if isinstance(vision_config, dict):
             vision_config = Qwen2_5OmniVisionEncoderConfig(**vision_config)
@@ -583,6 +591,9 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig):
             The size used for merging spatial dimensions.
         layer_types (`list`, *optional*):
             Attention pattern for each layer.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
     Example:
@@ -652,6 +663,7 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig):
         initializer_range=0.02,
         spatial_merge_size=2,
         layer_types=None,
+        pad_token_id: int | None = None,
         **kwargs,
     ):
         self.audio_token_index = audio_token_index
@@ -695,9 +707,11 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig):
         self.seconds_per_chunk = seconds_per_chunk  # zf
         self.audio_start_token_id = audio_start_token_id  # zf
         self.audio_end_token_id = audio_end_token_id  # zf
+        self.pad_token_id = pad_token_id
         self.initializer_range = initializer_range
         self.spatial_merge_size = spatial_merge_size
+        self.tie_word_embeddings = tie_word_embeddings
         self.layer_types = layer_types
         if self.layer_types is None:
@@ -710,9 +724,7 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig):
         layer_type_validation(self.layer_types, self.num_hidden_layers)
         self.rope_parameters = rope_parameters
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
-        )
+        super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs)
 class Qwen2_5OmniDiTConfig(PreTrainedConfig):

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl