transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff covers the changes between two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,7 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 
+from ... import initialization as init
 from ...activations import GELUActivation
 from ...cache_utils import Cache, DynamicCache
 from ...image_processing_utils import BatchFeature
@@ -776,6 +777,14 @@ class PaddleOCRVLPreTrainedModel(PreTrainedModel):
         "attentions": PaddleOCRAttention,
     }
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PaddleOCRVisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, PaddleOCRVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class PaddleOCRTextModel(PaddleOCRVLPreTrainedModel, Ernie4_5Model):
     def __init__(self, config: PaddleOCRTextConfig):
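The new `_init_weights` override re-creates the vision embeddings' `position_ids` buffer and the rotary embedding's `inv_freq` buffer during weight initialization; the same inverse-frequency pattern appears again in the Parakeet hunks further down. A minimal sketch of those two expressions, using assumed toy values for `dim`, `theta`, and the sequence length (the hunk reads them from the module):

```python
import torch

# Assumed toy values; the hunk above takes these from module.dim and module.theta.
dim, theta = 8, 10000.0

# One inverse frequency per rotary channel pair: 1.0, 0.1, 0.01, 0.001 for dim=8.
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))

# The position_ids buffer is just 0..n-1 with a leading batch axis, shape (1, n).
position_ids = torch.arange(16).expand((1, -1))
```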
@@ -977,18 +986,17 @@ class PaddleOCRVisionEncoder(VideoLlama3VisionEncoder):
         attention_mask: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
     ) -> BaseModelOutput:
-        """
-
-
-
-
-
-
-
-            attention_mask
-
-
-                The temporal, height and width of feature shape of each image in LLM.
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
+            The cumulative sequence lengths of each image or video feature.
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            The attention_mask used in forward function shape [batch_size X sequence_length] if not None.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
         """
         device = inputs_embeds.device
         hidden_states = inputs_embeds
@@ -1037,6 +1045,8 @@ class PaddleOCRVisionTransformer(PaddleOCRVLPreTrainedModel):
         self.encoder = PaddleOCRVisionEncoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
+        self.post_init()
+
     def forward(
         self,
         pixel_values: torch.FloatTensor,
@@ -149,7 +149,8 @@ def create_causal_mask_mapping(
     position_ids: Optional[torch.Tensor],
     token_type_ids: Optional[torch.Tensor] = None,
     pixel_values: Optional[torch.FloatTensor] = None,
-    is_training: bool = False,
+    is_training: Optional[bool] = False,
+    is_first_iteration: Optional[bool] = None,
     **kwargs,
 ) -> dict:
     """
@@ -169,31 +170,33 @@ def create_causal_mask_mapping(
         "past_key_values": past_key_values,
         "position_ids": position_ids,
     }
-    #
-    #
-    #
-
-
-
+    # Infer if prefill or decoding stage, if the flag isn't passed. This happens only when the mask is constructed
+    # from `forward` call. If users run a `forward` call, we have no option to infer `is_first_iteration` because users may be
+    # running generation with custom loop. Thus we need to infer it in a `non-perfect` way
+    # NOTE: Determining prefill in that case requires checking data values, which is not compile-compatible.
+    is_first_iteration = (
+        is_first_iteration
+        if is_first_iteration
+        else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None)
+    )
+
+    if is_first_iteration or not kwargs.get("use_cache", True):
         if token_type_ids is not None:
             # The logic bellow was originally written for Gemma3, where `token_type_ids` is reversed. Let's reverse
             # it to then use exactly the same logic.
             token_type_ids = 1 - token_type_ids
         else:
             logger.warning_once(
-                "
+                "It is a prefill stage but The `token_type_ids` is not provided. We recommend "
                 "passing `token_type_ids` to the model to prevent bad attention masking."
             )
-            # BC: when NOT training, use bidirectional mask if sequence length > 1. Otherwise, use the default causal
-            # mask. This is incorrect in some advanced use cases, hence the warning above.
             # NOTE: this branch can't be reached when training because `token_type_ids` is required as a model input.
-
-            token_type_ids = torch.ones_like(input_embeds)[:, :, 0]
+            token_type_ids = torch.ones_like(input_embeds)[:, :, 0]
 
     # Logic originally copied from Gemma3. It holds up for Paligemma as well because Paligemma assumes up to one image
     # per prompt AND we reverse `token_type_ids` above. Gemma3 uses a bidirectional mask for images, tagged through
     # `token_type_ids` 1s.
-    if token_type_ids is not None and
+    if token_type_ids is not None and is_first_iteration:
        # We need to pass an additional mask function to account for token type ids, and it needs to be an `or` (to
        # undo the causal masking)
 
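The added block infers the prefill/decode stage when the caller does not pass `is_first_iteration`: any call without an initialized KV cache, or any call carrying `pixel_values`, is treated as a first iteration. A standalone sketch of that predicate (the stub below only mimics the `is_initialized` attribute of the real `Cache` object):

```python
# Standalone sketch of the fallback predicate added above; FakeCache stands in for
# the transformers Cache instance, exposing just the attribute the predicate reads.
class FakeCache:
    def __init__(self, is_initialized):
        self.is_initialized = is_initialized


def infer_first_iteration(is_first_iteration, past_key_values, pixel_values):
    return (
        is_first_iteration
        if is_first_iteration
        else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None)
    )


print(infer_first_iteration(None, None, None))                 # True: no cache yet -> prefill
print(infer_first_iteration(None, FakeCache(True), None))      # False: cache filled -> decode step
print(infer_first_iteration(None, FakeCache(True), "pixels"))  # True: a new image arrives with a filled cache
print(infer_first_iteration(True, FakeCache(True), None))      # True: an explicit flag always wins
```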
@@ -550,6 +553,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
         use_cache=True,
         logits_to_keep=None,
         labels=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- custom `position_ids` and `pixel_values` handling
@@ -563,6 +567,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
             use_cache=use_cache,
             logits_to_keep=logits_to_keep,
             token_type_ids=token_type_ids,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -570,9 +575,11 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
         if model_inputs.get("position_ids") is not None:
             model_inputs["position_ids"] += 1
 
-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache). NOTE: use_cache=False needs pixel_values always
+        if is_first_iteration or not use_cache:
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
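As the comments above state, pixel values only need to reach the model on the first iteration; after that the image features are already merged into the KV cache. A hedged sketch of a manual greedy loop consistent with that behaviour (`model`, `input_ids`, and `pixel_values` are placeholders, not taken from the diff; `generate()` handles this internally):

```python
import torch


def greedy_decode(model, input_ids, pixel_values, max_new_tokens=16):
    # Hand-rolled loop for illustration only, under the assumption of a standard
    # causal-LM-with-vision forward signature.
    past_key_values = None
    generated = input_ids
    next_ids = input_ids
    for step in range(max_new_tokens):
        outputs = model(
            input_ids=next_ids,
            # Images are fed on the first step only; later steps rely on the cache.
            pixel_values=pixel_values if step == 0 else None,
            past_key_values=past_key_values,
            use_cache=True,
        )
        past_key_values = outputs.past_key_values
        next_ids = outputs.logits[:, -1:].argmax(dim=-1)
        generated = torch.cat([generated, next_ids], dim=-1)
    return generated
```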
@@ -586,6 +593,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
         past_key_values: Optional[Cache],
         position_ids: Optional[torch.Tensor],
         token_type_ids: Optional[torch.Tensor] = None,
+        is_first_iteration: Optional[bool] = False,
         **kwargs,
     ) -> dict:
         # Uses the overwritten `create_masks_for_generate` with `token_type_ids` masking
@@ -597,7 +605,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
             past_key_values,
             position_ids,
             token_type_ids,
-
+            is_first_iteration=is_first_iteration,
             **{k: v for k, v in kwargs.items() if k != "pixel_values"},
         )
 
@@ -510,6 +510,11 @@ class ParakeetPreTrainedModel(PreTrainedModel):
             # Initialize positional bias parameters
             init.normal_(module.bias_u, mean=0.0, std=std)
             init.normal_(module.bias_v, mean=0.0, std=std)
+        elif isinstance(module, ParakeetEncoderRelPositionalEncoding):
+            inv_freq = 1.0 / (
+                10000.0 ** (torch.arange(0, self.config.hidden_size, 2, dtype=torch.int64) / self.config.hidden_size)
+            )
+            init.copy_(module.inv_freq, inv_freq)
 
     def _get_subsampling_output_length(self, input_lengths: torch.Tensor):
         encoder_config = self.config.encoder_config if isinstance(self.config, ParakeetCTCConfig) else self.config
@@ -346,6 +346,11 @@ class ParakeetPreTrainedModel(PreTrainedModel):
             # Initialize positional bias parameters
             init.normal_(module.bias_u, mean=0.0, std=std)
             init.normal_(module.bias_v, mean=0.0, std=std)
+        elif isinstance(module, ParakeetEncoderRelPositionalEncoding):
+            inv_freq = 1.0 / (
+                10000.0 ** (torch.arange(0, self.config.hidden_size, 2, dtype=torch.int64) / self.config.hidden_size)
+            )
+            init.copy_(module.inv_freq, inv_freq)
 
     def _get_subsampling_output_length(self, input_lengths: torch.Tensor):
         encoder_config = self.config.encoder_config if isinstance(self.config, ParakeetCTCConfig) else self.config
@@ -16,10 +16,10 @@
 import itertools
 from typing import Optional, Union
 
-from ...tokenization_utils_tokenizers import
+from ...tokenization_utils_tokenizers import TokenizersBackend
 
 
-class
+class ParakeetTokenizer(TokenizersBackend):
     """
     Inherits all methods from [`PreTrainedTokenizerFast`]. Users should refer to this superclass for more information regarding those methods,
     except for `_decode` which is overridden to adapt it to CTC decoding:
@@ -51,4 +51,4 @@ class ParakeetTokenizerFast(PreTrainedTokenizerFast):
         )
 
 
-__all__ = ["
+__all__ = ["ParakeetTokenizer"]
@@ -696,6 +696,10 @@ class PatchTSMixerPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, PatchTSMixerBatchNorm):
             init.zeros_(module.batchnorm.bias)
             init.ones_(module.batchnorm.weight)
@@ -584,12 +584,13 @@ class PatchTSTPreTrainedModel(PreTrainedModel):
                 init.copy_(module.position_enc, position_enc)
             else:
                 init.copy_(module.position_enc, position_enc)
-        elif isinstance(module, nn.LayerNorm):
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
-
-
-
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Linear):
             init.normal_(module.weight, mean=0.0, std=self.config.init_std)
             if module.bias is not None:
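Both BatchNorm hunks above extend the LayerNorm branch to `nn.BatchNorm1d` and reset its running statistics, which are buffers rather than parameters and therefore untouched by the weight-only initializers. A quick check of the buffer names this relies on:

```python
import torch.nn as nn

bn = nn.BatchNorm1d(8)
# BatchNorm keeps its running statistics as buffers, not parameters:
print(sorted(name for name, _ in bn.named_buffers()))
# ['num_batches_tracked', 'running_mean', 'running_var']
print([name for name, _ in bn.named_parameters()])
# ['weight', 'bias']
```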
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pe_audio import *
+    from .feature_extraction_pe_audio import *
+    from .modeling_pe_audio import *
+    from .processing_pe_audio import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
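The new `__init__.py` swaps the package's entry in `sys.modules` for a `_LazyModule`, so the pe_audio submodules are only imported when something from them is first accessed. A minimal stand-in (not the transformers helper) showing the idea:

```python
import importlib
import os
import types


class LazyPackage(types.ModuleType):
    """Import submodules on first attribute access instead of at package import time."""

    def __init__(self, name, file, submodules):
        super().__init__(name)
        self.__file__ = file
        self.__path__ = [os.path.dirname(file)]  # keep the package importable
        self._submodules = set(submodules)

    def __getattr__(self, item):
        if item in self._submodules:
            module = importlib.import_module(f"{self.__name__}.{item}")
            setattr(self, item, module)  # cache so later lookups skip __getattr__
            return module
        raise AttributeError(f"module {self.__name__!r} has no attribute {item!r}")

# A package __init__.py would then replace itself (sketch):
# sys.modules[__name__] = LazyPackage(__name__, __file__, {"configuration_pe_audio", "modeling_pe_audio"})
```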
@@ -0,0 +1,206 @@
|
|
|
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+from ...configuration_utils import PreTrainedConfig, PretrainedConfig
+from ...modeling_rope_utils import RopeParameters
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class PeAudioEncoderConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PeAudioEncoder`]. It is used to instantiate a
+    PeAudioEncoder model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of pe-av-large.
+    e.g. [facebook/pe-av-large](https://huggingface.co/facebook/pe-av-large)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+
+    Args:
+        dac_config (`Union[PreTrainedConfig, dict]`, *optional*):
+            Configuration for the DAC audio encoder used to tokenize the raw audio inputs. If a dictionary is passed, it
+            will be used to instantiate a [`~transformers.DacConfig`] with default DAC hyperparameters.
+        hidden_size (`int`, *optional*, defaults to 1792):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4800):
+            Dimension of the feedforward layers in the Transformer blocks.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of Transformer encoder blocks.
+        num_attention_heads (`int`, *optional*, defaults to 14):
+            Number of attention heads used in each attention layer.
+        num_key_value_heads (`int`, *optional*):
+            Number of key and value heads for grouped-query attention. If unset, this defaults to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head for query, key, and value projections.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the Transformer blocks.
+        max_position_embeddings (`int`, *optional*, defaults to 10000):
+            Maximum sequence length supported by the rotary position embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation of the truncated normal initializer for weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            Epsilon used by the RMS normalization layers.
+        rope_parameters (`Union[RopeParameters, dict]`, *optional*, defaults to `{'rope_theta': 20000}`):
+            Parameters for the rotary position embeddings, such as the base `rope_theta`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias terms in the query, key, value, and output projections.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout ratio applied to attention probabilities.
+
+    ```python
+    >>> from transformers import PeAudioEncoder, PeAudioEncoderConfig
+
+    >>> # Initializing a PeAudioEncoder style configuration
+    >>> configuration = PeAudioEncoderConfig()
+
+    >>> # Initializing a model from the pe-av-large style configuration
+    >>> model = PeAudioEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pe_audio_encoder"
+    sub_configs = {"dac_config": AutoConfig}
+    base_config_key = "audio_video_config"
+
+    _default_dac_config_kwargs = {
+        "downsampling_ratios": [2, 8, 10, 12],
+        "encoder_hidden_size": 64,
+        "codebook_dim": 128,
+    }
+
+    def __init__(
+        self,
+        dac_config: Optional[Union[dict, PreTrainedConfig]] = None,
+        hidden_size: Optional[int] = 1792,
+        intermediate_size: Optional[int] = 4800,
+        num_hidden_layers: Optional[int] = 6,
+        num_attention_heads: Optional[int] = 14,
+        num_key_value_heads: Optional[int] = None,
+        head_dim: Optional[int] = 128,
+        hidden_act: Optional[str] = "silu",
+        max_position_embeddings: Optional[int] = 10000,
+        initializer_range: Optional[float] = 0.02,
+        rms_norm_eps: Optional[float] = 1e-5,
+        rope_parameters: Optional[Union[RopeParameters, dict]] = {"rope_theta": 20000},
+        attention_bias: Optional[bool] = False,
+        attention_dropout: Optional[float] = 0.0,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.rope_parameters = rope_parameters
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        if isinstance(dac_config, dict):
+            dac_config["model_type"] = dac_config.get("model_type", "dac")
+            dac_config = CONFIG_MAPPING[dac_config["model_type"]](**{**self._default_dac_config_kwargs, **dac_config})
+        elif dac_config is None:
+            dac_config = CONFIG_MAPPING["dac"](**self._default_dac_config_kwargs)
+
+        self.dac_config = dac_config
+
+        super().__init__(**kwargs)
+
+
+class PeAudioConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PeAudioModel`]. It is used to instantiate a
+    PeAudioModel model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of pe-av-large.
+    e.g. [facebook/pe-av-large](https://huggingface.co/facebook/pe-av-large)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`dict` or `PreTrainedConfig`, *optional*):
+            Configuration for the text model component.
+        audio_config (`dict` or `PreTrainedConfig`, *optional*):
+            Configuration for the audio encoder component.
+
+    ```python
+    >>> from transformers import PeAudioModel, PeAudioConfig
+
+    >>> # Initializing a PeAudioModel style configuration
+    >>> configuration = PeAudioConfig()
+
+    >>> # Initializing a model from the pe-av-large style configuration
+    >>> model = PeAudioModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pe_audio"
+    sub_configs = {"text_config": AutoConfig, "audio_config": PeAudioEncoderConfig}
+    base_config_key = "audio_video_config"
+
+    _default_text_config_kwargs = {
+        "model_type": "modernbert",
+        "hidden_size": 1024,
+        "intermediate_size": 2624,
+        "num_hidden_layers": 22,
+        "num_attention_heads": 16,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        audio_config=None,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "modernbert")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](
+                **{**self._default_text_config_kwargs, **text_config}
+            )
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs)
+
+        if isinstance(audio_config, dict):
+            audio_config = PeAudioEncoderConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = PeAudioEncoderConfig()
+
+        self.text_config = text_config
+        self.audio_config = audio_config
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["PeAudioEncoderConfig", "PeAudioConfig"]
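Both classes build their sub-configurations the same way: a dict argument is merged over the class-level default kwargs before the registered config class is instantiated, and `None` falls back to those defaults entirely. A minimal sketch of that merging behaviour, assuming the classes are importable from the top-level `transformers` namespace once the auto mappings are registered:

```python
from transformers import PeAudioConfig, PeAudioEncoderConfig

# A dict dac_config is merged over _default_dac_config_kwargs, so only the
# overridden keys need to be provided.
audio_config = PeAudioEncoderConfig(dac_config={"codebook_dim": 256})
print(audio_config.dac_config.codebook_dim)         # 256 (override)
print(audio_config.dac_config.encoder_hidden_size)  # 64 (class default kept)

# PeAudioConfig applies the same pattern to its text and audio sub-configs:
# with no arguments it builds a default ModernBERT text config and a default
# PeAudioEncoderConfig.
config = PeAudioConfig()
print(config.text_config.model_type)   # "modernbert"
print(config.audio_config.model_type)  # "pe_audio_encoder"
```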
transformers/models/pe_audio/feature_extraction_pe_audio.py (new file)
@@ -0,0 +1,162 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import load_audio
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class PeAudioFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a PeAudioFeatureExtractor feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 48000):
+            The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used for padding.
+        hop_length (`int`, *optional*, defaults to 1920):
+            Overlap length between successive windows.
+    """
+
+    model_input_names = ["input_values"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 48_000,
+        padding_value: float = 0.0,
+        hop_length: int = 1920,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.hop_length = hop_length
+
+    def _reflect_pad(self, wav):
+        if len(wav) % self.hop_length == 0:
+            return wav
+        p1d = (0, self.hop_length - (len(wav) % self.hop_length))
+        return np.pad(wav, p1d, "reflect")
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]], str, list[str]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        from_file = False
+        if isinstance(raw_audio, str):
+            raw_audio = [raw_audio]
+
+        if isinstance(raw_audio, (list, tuple)) and isinstance(raw_audio[0], str):
+            loaded = []
+            for audio_file in raw_audio:
+                loaded.append(load_audio(audio_file, self.sampling_rate))
+            raw_audio = loaded
+            from_file = True
+
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        elif not from_file:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not is_batched and not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        if isinstance(raw_audio, list):
+            raw_audio = [self._reflect_pad(x) for x in raw_audio]
+        else:
+            raw_audio = self._reflect_pad(raw_audio)
+
+        # verify inputs are valid
+        for example in raw_audio:
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2:
+                raise ValueError("Stereo audio isn't supported for now")
+
+        input_values = BatchFeature({"input_values": raw_audio})
+
+        # normal padding on batch
+        padded_inputs = self.pad(
+            input_values,
+            max_length=max_length,
+            truncation=truncation,
+            padding=padding,
+            return_attention_mask=padding,
+            pad_to_multiple_of=self.hop_length,
+        )
+        if padding:
+            padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
+        if padding:
+            padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :]
+
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+__all__ = ["PeAudioFeatureExtractor"]
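The `__call__` above accepts raw waveforms (or file paths, loaded via `load_audio`), reflect-pads each example to a multiple of `hop_length`, then batches and pads. A minimal usage sketch, assuming the class is exported at the top level; exact output shapes depend on the padding path, so only the general layout is described in the comments:

```python
import numpy as np

from transformers import PeAudioFeatureExtractor

feature_extractor = PeAudioFeatureExtractor()  # 48 kHz mono, hop_length=1920

# One second of mono audio at the expected sampling rate.
waveform = np.random.randn(48_000).astype(np.float32)

inputs = feature_extractor(waveform, sampling_rate=48_000, return_tensors="pt")
# With the default padding=True, the output carries the padded waveforms plus
# a padding_mask; the time dimension is padded to a multiple of hop_length.
print(sorted(inputs.keys()))         # ['input_values', 'padding_mask']
print(inputs["input_values"].shape)  # batched, time padded to a multiple of 1920
```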