PyPI - transformers - Versions diffs - 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl - Mend

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (671) hide show

transformers/models/pe_audio/modular_pe_audio.py ADDED Viewed

@@ -0,0 +1,299 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ... import initialization as init
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput
+from ...utils import ModelOutput, auto_docstring, can_return_tuple
+from ...utils.generic import check_model_inputs
+from ..auto import AutoModel
+from ..dac.modeling_dac import DacEncoder, DacEncoderBlock, Snake1d
+from ..pe_audio_video.modeling_pe_audio_video import (
+    PeAudioVideoContrastiveHead,
+    PeAudioVideoEncoder,
+    PeAudioVideoPreTrainedModel,
+)
+from .configuration_pe_audio import PeAudioConfig, PeAudioEncoderConfig
+class PeAudioDacEncoderBlock(DacEncoderBlock):
+    # PeAudio-named subclass of the DAC `DacEncoderBlock`. It adds no behavior of its own:
+    # the constructor forwards `config`, `stride` and `stride_index` unchanged to the parent.
+    def __init__(self, config: PreTrainedConfig, stride: int = 1, stride_index: int = 1):
+        super().__init__(config, stride=stride, stride_index=stride_index)
+class PeAudioDacEncoder(DacEncoder):
+    # PeAudio-named subclass of the DAC `DacEncoder`. It adds no behavior of its own:
+    # the constructor only forwards `config` to the parent class.
+    def __init__(self, config: PreTrainedConfig):
+        super().__init__(config)
+class PeAudioEncoderEmbedder(nn.Module):
+    """
+    Turns raw audio input into transformer input embeddings.
+    The input goes through a DAC encoder and a kernel-size-1 `Conv1d` bottleneck (both run without gradient
+    tracking), and the result is projected to `config.hidden_size` by a linear layer.
+    """
+    def __init__(self, config: PeAudioEncoderConfig):
+        super().__init__()
+        self.config = config
+        dac_config = config.dac_config
+        # Sub-modules are created in the order: DAC encoder, bottleneck, projection.
+        self.dac_encoder = PeAudioDacEncoder(dac_config)
+        self.bottleneck = nn.Conv1d(dac_config.hidden_size, dac_config.codebook_dim, kernel_size=1)
+        self.data_proj = nn.Linear(dac_config.codebook_dim, config.hidden_size)
+    def forward(
+        self,
+        input_values: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            input_values: audio input handed to the DAC encoder.
+            padding_mask: optional mask aligned with `input_values` along dimension 1.
+        Returns:
+            A pair `(inputs_embeds, padding_mask)`. The mask is subsampled along dimension 1 with a step of
+            `config.dac_config.hop_length`, or is `None` when no mask was given.
+        """
+        # The codec front-end runs under `no_grad` with cuDNN disabled, so no gradient reaches
+        # the DAC encoder or the bottleneck convolution.
+        with torch.no_grad(), torch.backends.cudnn.flags(enabled=False):
+            codec_latents = self.bottleneck(self.dac_encoder(input_values))
+        # Conv1d output is channels-first; move the channel axis last before the linear projection.
+        inputs_embeds = self.data_proj(codec_latents.transpose(1, 2))
+        if padding_mask is None:
+            return inputs_embeds, None
+        hop_length = self.config.dac_config.hop_length
+        return inputs_embeds, padding_mask[:, ::hop_length]
+# PeAudio-named subclass of `PeAudioVideoContrastiveHead`; the body is empty, so all behavior is inherited.
+class PeAudioContrastiveHead(PeAudioVideoContrastiveHead): ...
+class PeAudioPreTrainedModel(PeAudioVideoPreTrainedModel):
+    """
+    Base class for the PeAudio models. It reuses `PeAudioVideoPreTrainedModel` and adds weight initialization
+    for the module types listed in `_init_weights`.
+    """
+    base_model_prefix = "audio_model"
+    @torch.no_grad()
+    def _init_weights(self, module):
+        """
+        Initialize `module` after the parent initialization has run.
+        - `nn.Conv1d`: truncated-normal weights (std 0.02) and zero bias, when a bias exists.
+        - `Snake1d`: `alpha` set to ones.
+        - `nn.ConvTranspose1d`: PyTorch's own `reset_parameters()`.
+        - `nn.Embedding`: normal weights (mean 0.0, std 0.02).
+        """
+        super()._init_weights(module)
+        if isinstance(module, nn.Conv1d):
+            init.trunc_normal_(module.weight, std=0.02)
+            # A convolution built with `bias=False` has `module.bias is None`; skip it instead of failing.
+            if module.bias is not None:
+                init.constant_(module.bias, 0)
+        elif isinstance(module, Snake1d):
+            init.ones_(module.alpha)
+        elif isinstance(module, nn.ConvTranspose1d):
+            module.reset_parameters()
+        elif isinstance(module, nn.Embedding):
+            init.normal_(module.weight, mean=0.0, std=0.02)
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for outputs of [`PeAudioEncoder`].
+    """
+)
+class PeAudioEncoderOutput(BaseModelOutputWithPooling):
+    # Extends `BaseModelOutputWithPooling` with two optional, audio-specific fields.
+    # `codec_features` is declared here but is not populated by the `PeAudioEncoder.forward` shown in this file.
+    codec_features: Optional[torch.FloatTensor] = None
+    # `PeAudioEncoder.forward` stores the padding mask returned by its embedder in this field.
+    # NOTE(review): that value is a single tensor (or None), not a tuple as annotated -- confirm and tighten the type.
+    output_mask: Optional[tuple[torch.FloatTensor]] = None
+# TODO: add the capture of codec features?
+@auto_docstring(
+    custom_intro="""
+    The PeAudio Encoder model.
+    """
+)
+class PeAudioEncoder(PeAudioVideoEncoder):
+    base_model_prefix = "audio_model.audio_encoder"
+    @can_return_tuple
+    @check_model_inputs
+    def forward(
+        self,
+        input_values: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        # Stage 1: codec front-end, then patch embedding. The embedder's mask is kept separately
+        # because it is what gets reported back as `output_mask`.
+        codec_embeds, output_mask = self.embedder(input_values, padding_mask=padding_mask)
+        hidden_states, attention_mask = self.patch_embedder(codec_embeds, padding_mask=output_mask)
+        if attention_mask is not None:
+            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+        # Stage 2: rotary position embeddings for positions 0..seq_len-1, shared by every layer.
+        seq_len = hidden_states.shape[1]
+        position_ids = torch.arange(seq_len, device=hidden_states.device).unsqueeze(0)
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # Stage 3: run at most `config.num_hidden_layers` transformer layers.
+        active_layers = self.layers[: self.config.num_hidden_layers]
+        for layer in active_layers:
+            hidden_states = layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.output(self.norm(hidden_states))
+        # Position 0 is returned as the pooled output and the remaining positions as the sequence output
+        # (presumably position 0 is a class token added by `patch_embedder` -- confirm in the parent class).
+        pooled_output = hidden_states[:, 0]
+        sequence_output = hidden_states[:, 1:]
+        return PeAudioEncoderOutput(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            output_mask=output_mask,
+        )
+# TODO: not sure about the typing for text_model_output
+@dataclass
+# @auto_docstring
+class PeAudioOutput(ModelOutput):
+    """
+    Output of the PeAudio contrastive models.
+    Fields:
+        loss: contrastive loss, set only when the model is called with `return_loss=True`.
+        logits_audio_text: audio-to-text similarity logits after scaling and bias.
+        text_audio_embeds: text embeddings after the text-audio contrastive head.
+        audio_embeds: audio embeddings after the audio contrastive head.
+        text_outputs: raw output of the text model.
+        audio_outputs: raw output of the audio encoder.
+    """
+    loss: Optional[torch.FloatTensor] = None
+    logits_audio_text: Optional[torch.FloatTensor] = None
+    text_audio_embeds: Optional[torch.FloatTensor] = None
+    audio_embeds: Optional[torch.FloatTensor] = None
+    # Both nested outputs default to None, so they are annotated as Optional like the other fields.
+    text_outputs: Optional[BaseModelOutputWithPooling] = None
+    audio_outputs: Optional[BaseModelOutputWithPooling] = None
+    def to_tuple(self) -> tuple[Any, ...]:
+        """Return the fields listed by `keys()` as a tuple, converting the two nested model outputs to tuples too."""
+        nested_keys = ("text_outputs", "audio_outputs")
+        return tuple(getattr(self, key).to_tuple() if key in nested_keys else self[key] for key in self.keys())
+class PeAudioModel(PeAudioPreTrainedModel):
+    """
+    Contrastive audio-text model made of a text model, a `PeAudioEncoder` and one contrastive head per modality.
+    `forward` returns audio-to-text similarity logits of shape `(n_audio, n_text)`, scaled by a learned
+    `text_audio_logit_scale` and shifted by a learned `text_audio_logit_bias`.
+    """
+    def __init__(self, config: PeAudioConfig):
+        super().__init__(config)
+        self.text_model = AutoModel.from_config(config.text_config)
+        self.audio_encoder = PeAudioEncoder(config.audio_config)
+        self.text_audio_head = PeAudioContrastiveHead(config.text_config.hidden_size, config.text_config.hidden_size)
+        self.audio_head = PeAudioContrastiveHead(config.audio_config.hidden_size, config.text_config.hidden_size)
+        self.text_audio_logit_scale = nn.Parameter(torch.zeros(1))
+        self.text_audio_logit_bias = nn.Parameter(torch.zeros(1))
+        self.post_init()
+    def get_text_audio_embeds(self, input_ids, attention_mask=None):
+        """Return the text embeddings: first position of the text model's last hidden state, passed through `text_audio_head`."""
+        # TODO: naming can be improved here...
+        # `hidden_states` is only returned when explicitly requested, so ask for it here exactly as
+        # `forward` does; otherwise the indexing below fails on `None`.
+        text_outputs: MaskedLMOutput = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+            return_dict=True,
+        )
+        text_audio_embeds = text_outputs.hidden_states[-1][:, 0]
+        return self.text_audio_head(text_audio_embeds)
+    def get_audio_embeds(self, input_values, padding_mask=None):
+        """Return the audio embeddings: the audio encoder's pooled output passed through `audio_head`."""
+        audio_outputs: BaseModelOutputWithPooling = self.audio_encoder(
+            input_values=input_values,
+            padding_mask=padding_mask,
+            return_dict=True,
+        )
+        audio_embeds = audio_outputs.pooler_output
+        return self.audio_head(audio_embeds)
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        input_values: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        return_loss: Optional[bool] = None,
+        **kwargs,
+    ) -> PeAudioOutput:
+        """
+        Args:
+            input_ids: token ids for the text model.
+            input_values: audio input for the audio encoder.
+            attention_mask: optional attention mask for the text model.
+            padding_mask: optional padding mask for the audio encoder.
+            return_loss: when truthy, also compute the pairwise sigmoid contrastive loss. This requires
+                as many texts as audio clips, with text `i` matching audio `i`.
+            **kwargs: forwarded to both the audio encoder and the text model.
+        Returns:
+            `PeAudioOutput` with logits of shape `(n_audio, n_text)`.
+        """
+        audio_outputs: BaseModelOutputWithPooling = self.audio_encoder(
+            input_values=input_values, padding_mask=padding_mask, **kwargs
+        )
+        # The text embedding is read from `hidden_states`, so the text model must always return them.
+        kwargs["output_hidden_states"] = True
+        text_outputs: MaskedLMOutput = self.text_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
+        audio_embeds = audio_outputs.pooler_output
+        audio_embeds = self.audio_head(audio_embeds)
+        text_audio_embeds = text_outputs.hidden_states[-1][:, 0]
+        text_audio_embeds = self.text_audio_head(text_audio_embeds)
+        logits_audio_text = audio_embeds @ text_audio_embeds.T
+        logits_audio_text = logits_audio_text * self.text_audio_logit_scale + self.text_audio_logit_bias
+        loss = None
+        if return_loss:
+            # Pairwise sigmoid loss: +1 for matching (diagonal) pairs and -1 for all other pairs.
+            # With 0/1 labels every negative pair reduces to the constant -logsigmoid(0) and gets no gradient.
+            eye = torch.eye(logits_audio_text.shape[0], device=logits_audio_text.device)
+            labels = 2 * eye - 1
+            loss = -F.logsigmoid(labels * logits_audio_text).sum() / logits_audio_text.shape[0]
+        return PeAudioOutput(
+            logits_audio_text=logits_audio_text,
+            text_audio_embeds=text_audio_embeds,
+            audio_embeds=audio_embeds,
+            text_outputs=text_outputs,
+            audio_outputs=audio_outputs,
+            loss=loss,
+        )
+# TODO: underline in documentation that logits output shape is
+# 1. Model: (n_audio, n_text)
+# 2. Frame-level: (n_audio, n_text, n_frames)
+class PeAudioFrameLevelModel(PeAudioModel):
+    """
+    Frame-level variant of `PeAudioModel`: the audio side uses the encoder's per-frame `last_hidden_state`
+    instead of the pooled output, so the logits have shape `(n_audio, n_text, n_frames)`.
+    """
+    def get_audio_embeds(self, input_values, padding_mask=None):
+        """Return per-frame audio embeddings: the encoder's `last_hidden_state` passed through `audio_head`."""
+        audio_outputs: BaseModelOutputWithPooling = self.audio_encoder(
+            input_values=input_values,
+            padding_mask=padding_mask,
+            return_dict=True,
+        )
+        audio_embeds = audio_outputs.last_hidden_state
+        audio_embeds = self.audio_head(audio_embeds)
+        return audio_embeds
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        input_values: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        return_loss: Optional[bool] = None,
+        **kwargs,
+    ) -> PeAudioOutput:
+        """
+        Args:
+            input_ids: token ids for the text model.
+            input_values: audio input for the audio encoder.
+            attention_mask: optional attention mask for the text model.
+            padding_mask: optional padding mask for the audio encoder.
+            return_loss: when truthy, also compute the pairwise sigmoid contrastive loss. This requires
+                as many texts as audio clips, with text `i` matching audio `i`.
+            **kwargs: forwarded to both the audio encoder and the text model.
+        Returns:
+            `PeAudioOutput` with logits of shape `(n_audio, n_text, n_frames)`.
+        """
+        audio_outputs: BaseModelOutputWithPooling = self.audio_encoder(
+            input_values=input_values, padding_mask=padding_mask, **kwargs
+        )
+        # The text embedding is read from `hidden_states`, so the text model must always return them.
+        kwargs["output_hidden_states"] = True
+        text_outputs: MaskedLMOutput = self.text_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
+        audio_embeds = audio_outputs.last_hidden_state
+        audio_embeds = self.audio_head(audio_embeds)
+        text_audio_embeds = text_outputs.hidden_states[-1][:, 0]
+        text_audio_embeds = self.text_audio_head(text_audio_embeds)
+        # (n_audio, n_frames, dim) @ (dim, n_text) -> (n_audio, n_frames, n_text), then frames moved last.
+        logits_audio_text = (audio_embeds @ text_audio_embeds.T).transpose(1, 2)
+        logits_audio_text = logits_audio_text * self.text_audio_logit_scale + self.text_audio_logit_bias
+        loss = None
+        if return_loss:
+            # Pairwise sigmoid loss: +1 for matching (diagonal) pairs and -1 for all other pairs.
+            eye = torch.eye(logits_audio_text.shape[0], device=logits_audio_text.device)
+            # The trailing singleton axis broadcasts each pair label over the frame axis; a bare (n, n)
+            # matrix would instead be aligned with the (n_text, n_frames) axes of the logits.
+            labels = (2 * eye - 1).unsqueeze(-1)
+            # NOTE(review): the sum runs over every frame, padded ones included, and is only divided by
+            # n_audio -- confirm whether frames should be masked and/or averaged.
+            loss = -F.logsigmoid(labels * logits_audio_text).sum() / logits_audio_text.shape[0]
+        return PeAudioOutput(
+            logits_audio_text=logits_audio_text,
+            text_audio_embeds=text_audio_embeds,
+            audio_embeds=audio_embeds,
+            text_outputs=text_outputs,
+            audio_outputs=audio_outputs,
+            loss=loss,
+        )
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "PeAudioFrameLevelModel",
+    "PeAudioModel",
+    "PeAudioEncoder",
+]

transformers/models/pe_audio/processing_pe_audio.py ADDED Viewed

@@ -0,0 +1,24 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...processing_utils import ProcessorMixin
+class PeAudioProcessor(ProcessorMixin):
+    # Declarative `ProcessorMixin` subclass: it only names its two components and the classes used for them.
+    attributes = ["feature_extractor", "tokenizer"]
+    # The component classes are referenced by name (strings); nothing is imported here.
+    feature_extractor_class = "PeAudioFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+# Names exported by `from <this module> import *`.
+__all__ = ["PeAudioProcessor"]

transformers/models/pe_audio_video/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+if TYPE_CHECKING:
+    # Seen by static type checkers only: import every public name of the three submodules directly.
+    from .configuration_pe_audio_video import *
+    from .modeling_pe_audio_video import *
+    from .processing_pe_audio_video import *
+else:
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from the
+    # import structure that `define_import_structure` derives from this file (presumably so that
+    # submodules are only imported on first attribute access -- see `_LazyModule`).
+    import sys
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

transformers/models/pe_audio_video/configuration_pe_audio_video.py ADDED Viewed

@@ -0,0 +1,225 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+from ...configuration_utils import PreTrainedConfig, PretrainedConfig
+from ...modeling_rope_utils import RopeParameters
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+# Module-level logger named after this module.
+logger = logging.get_logger(__name__)
+class PeAudioVideoEncoderConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PeAudioVideoEncoder`]. It is used to instantiate a
+    PeAudioVideoEncoder model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of pe-av-large.
+    e.g. [facebook/pe-av-large](https://huggingface.co/facebook/pe-av-large)
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+    Args:
+        audio_config (`Union[PreTrainedConfig, dict]`, *optional*):
+            Configuration for the audio encoder. If a dictionary is provided, it is used to instantiate
+            [`~transformers.PeAudioEncoderConfig`].
+        video_config (`Union[PreTrainedConfig, dict]`, *optional*):
+            Configuration for the video encoder. If a dictionary is provided, it is used to instantiate
+            [`~transformers.PeVideoEncoderConfig`].
+        hidden_size (`int`, *optional*, defaults to 1792):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4800):
+            Dimension of the feedforward layers in the Transformer blocks.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of Transformer encoder blocks.
+        num_attention_heads (`int`, *optional*, defaults to 14):
+            Number of attention heads used in each attention layer.
+        num_key_value_heads (`int`, *optional*):
+            Number of key and value heads for grouped-query attention. If unset, this defaults to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head for query, key, and value projections.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the Transformer blocks.
+        max_position_embeddings (`int`, *optional*, defaults to 10000):
+            Maximum sequence length supported by the rotary position embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation of the truncated normal initializer for weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            Epsilon used by the RMS normalization layers.
+        rope_parameters (`Union[RopeParameters, dict]`, *optional*, defaults to `{'rope_theta': 20000}`):
+            Parameters for the rotary position embeddings, such as the base `rope_theta`. A dictionary is copied
+            before being stored, so the configuration never aliases the caller's object.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias terms in the query, key, value, and output projections.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout ratio applied to attention probabilities.
+    ```python
+    >>> from transformers import PeAudioVideoEncoder, PeAudioVideoEncoderConfig
+    >>> # Initializing a PeAudioVideoEncoder style configuration
+    >>> configuration = PeAudioVideoEncoderConfig()
+    >>> # Initializing a model from the pe-av-large style configuration
+    >>> model = PeAudioVideoEncoder(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "pe_audio_video_encoder"
+    base_config_key = "audio_video_config"
+    sub_configs = {"audio_config": AutoConfig, "video_config": AutoConfig}
+    def __init__(
+        self,
+        audio_config: Optional[Union[dict, PreTrainedConfig]] = None,
+        video_config: Optional[Union[dict, PreTrainedConfig]] = None,
+        hidden_size: Optional[int] = 1792,
+        intermediate_size: Optional[int] = 4800,
+        num_hidden_layers: Optional[int] = 6,
+        num_attention_heads: Optional[int] = 14,
+        num_key_value_heads: Optional[int] = None,
+        head_dim: Optional[int] = 128,
+        hidden_act: Optional[str] = "silu",
+        max_position_embeddings: Optional[int] = 10000,
+        initializer_range: Optional[float] = 0.02,
+        rms_norm_eps: Optional[float] = 1e-5,
+        rope_parameters: Optional[Union[RopeParameters, dict]] = {"rope_theta": 20000},
+        attention_bias: Optional[bool] = False,
+        attention_dropout: Optional[float] = 0.0,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        # The signature default is one dict object shared by every call. Store a copy so that a later
+        # in-place change on one configuration cannot leak into other instances or the caller's dict.
+        self.rope_parameters = dict(rope_parameters) if isinstance(rope_parameters, dict) else rope_parameters
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Sub-configs given as dicts get a default `model_type` through a merged copy, leaving the
+        # caller's dict untouched; a key supplied by the caller still takes precedence.
+        if isinstance(audio_config, dict):
+            audio_config = {"model_type": "pe_audio_encoder", **audio_config}
+            audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config)
+        elif audio_config is None:
+            audio_config = CONFIG_MAPPING["pe_audio_encoder"]()
+        if isinstance(video_config, dict):
+            video_config = {"model_type": "pe_video_encoder", **video_config}
+            video_config = CONFIG_MAPPING[video_config["model_type"]](**video_config)
+        elif video_config is None:
+            video_config = CONFIG_MAPPING["pe_video_encoder"]()
+        self.audio_config = audio_config
+        self.video_config = video_config
+        super().__init__(**kwargs)
+class PeAudioVideoConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PeAudioVideoModel`]. It is used to instantiate a
+    PeAudioVideoModel model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of pe-av-large.
+    e.g. [facebook/pe-av-large](https://huggingface.co/facebook/pe-av-large)
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+    Args:
+        text_config (`dict` or `PreTrainedConfig`, *optional*):
+            Configuration for the text model component. A dictionary is merged over the built-in ModernBERT
+            defaults (`_default_text_config_kwargs`) without being modified.
+        audio_video_config (`dict` or `PreTrainedConfig`, *optional*):
+            Configuration for the audio-video encoder component.
+    ```python
+    >>> from transformers import PeAudioVideoModel, PeAudioVideoConfig
+    >>> # Initializing a PeAudioVideoModel style configuration
+    >>> configuration = PeAudioVideoConfig()
+    >>> # Initializing a model from the pe-av-large style configuration
+    >>> model = PeAudioVideoModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "pe_audio_video"
+    sub_configs = {"text_config": AutoConfig, "audio_video_config": PeAudioVideoEncoderConfig}
+    # Defaults applied underneath any user-supplied text configuration dictionary.
+    _default_text_config_kwargs = {
+        "model_type": "modernbert",
+        "hidden_size": 1024,
+        "intermediate_size": 2624,
+        "num_hidden_layers": 22,
+        "num_attention_heads": 16,
+    }
+    def __init__(
+        self,
+        text_config=None,
+        audio_video_config=None,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            # Merge into a new dict so the caller's dictionary is not modified. The defaults already
+            # carry `model_type="modernbert"`, which a caller-supplied `model_type` overrides.
+            text_config = {**self._default_text_config_kwargs, **text_config}
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs)
+        if isinstance(audio_video_config, dict):
+            audio_video_config = PeAudioVideoEncoderConfig(**audio_video_config)
+        elif audio_video_config is None:
+            audio_video_config = PeAudioVideoEncoderConfig()
+        self.text_config = text_config
+        self.audio_video_config = audio_video_config
+        super().__init__(**kwargs)
+    @property
+    def audio_config(self):
+        """A new `pe_audio` configuration, built on every access from the text config and the nested audio encoder config."""
+        return CONFIG_MAPPING["pe_audio"](
+            text_config=self.text_config,
+            audio_config=self.audio_video_config.audio_config,
+        )
+    @property
+    def video_config(self):
+        """A new `pe_video` configuration, built on every access from the text config and the nested video encoder config."""
+        return CONFIG_MAPPING["pe_video"](
+            text_config=self.text_config,
+            video_config=self.audio_video_config.video_config,
+        )
+# Names exported by `from <this module> import *`.
+__all__ = ["PeAudioVideoEncoderConfig", "PeAudioVideoConfig"]

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl