torchaudio-2.8.0-cp310-cp310-win_amd64.whl → torchaudio-2.9.0-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchaudio might be problematic.
Files changed (92)
  1. torchaudio/__init__.py +179 -39
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +12 -3
  5. torchaudio/_torchcodec.py +73 -85
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +0 -2
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +26 -60
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +14 -2
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +1 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +1 -0
  23. torchaudio/transforms/_transforms.py +2 -2
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -3
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -350
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -20
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -150
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -68
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -441
  54. torchaudio/prototype/functional/_rir.py +0 -382
  55. torchaudio/prototype/functional/functional.py +0 -193
  56. torchaudio/prototype/models/__init__.py +0 -39
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -337
  59. torchaudio/prototype/models/conv_emformer.py +0 -529
  60. torchaudio/prototype/models/hifi_gan.py +0 -342
  61. torchaudio/prototype/models/rnnt.py +0 -717
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -402
  63. torchaudio/prototype/pipelines/__init__.py +0 -21
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -461
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -275
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -118
  75. torchaudio-2.8.0.dist-info/RECORD +0 -145
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -977
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -275
  91. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
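Most of the removals above (torchaudio/_backend/, torchaudio/backend/, torchaudio/io/, torchaudio/sox_effects/, torchaudio/kaldi_io.py, the torchaudio.prototype subpackages, and the entire torio package) correspond to dropping the SoX/soundfile/FFmpeg dispatch layer, while the changes to torchaudio/__init__.py and torchaudio/_torchcodec.py indicate that audio decoding and encoding are now routed through TorchCodec. A minimal smoke-test sketch of the post-upgrade I/O path, assuming torchaudio.load and torchaudio.save remain the public entry points in 2.9.0 and that the torchcodec package is installed (this diff alone does not confirm either):

# Hedged sketch: basic I/O after the backend removal. Assumes torchaudio 2.9.0
# delegates decoding/encoding to torchcodec, as suggested by the removal of
# torchaudio/_backend/* and the edits to torchaudio/_torchcodec.py.
import torchaudio

waveform, sample_rate = torchaudio.load("speech.wav")      # (channels, frames) tensor plus sample rate
print(waveform.shape, sample_rate)
torchaudio.save("speech_copy.wav", waveform, sample_rate)  # re-encode to WAV

The single hunk reproduced below is the complete removal of torchaudio/prototype/models/_emformer_hubert.py (entry 58 in the list, +0 -337).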
@@ -1,337 +0,0 @@
-from typing import List, Optional, Tuple
-
-import torch
-from torchaudio.models import Wav2Vec2Model
-from torchaudio.models.emformer import Emformer
-from torchaudio.models.rnnt import _TimeReduction
-from torchaudio._internal.module_utils import dropping_support
-
-
-
-class FeatureEncoder(torch.nn.Module):
-    """Extract features from log-mel spectrogram input. Consists of linear layer and time reduction layer.
-
-    Args:
-        input_dim (int): The feature dimension of log-mel spectrogram feature.
-        output_dim (int): The feature dimension after linear layer.
-        use_bias (bool): If ``True``, enable bias parameter in the linear layer.
-        stride (int): Number of frames to merge for the output frame.
-    """
-
-    def __init__(self, input_dim: int, output_dim: int, use_bias: bool, stride: int):
-        super().__init__()
-        self.linear = torch.nn.Linear(input_dim, output_dim, bias=use_bias)
-        self.time_reduction = _TimeReduction(stride)
-
-    def forward(
-        self, input: torch.Tensor, lengths: Optional[torch.Tensor]
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """
-        Args:
-            input (torch.Tensor): The log-mel spectrogram input.
-                Tensor with dimensions `(batch, time, input_dim)`.
-            lengths (torch.Tensor or None): Valid length of each input sample.
-                Tensor with dimension `(batch, )`.
-
-        Returns:
-            (torch.Tensor, torch.Tensor or None):
-            torch.Tensor
-                Returned feature Tensor after linear layer and time reduction layer.
-                Tensor with dimensions `(batch, time // stride, output_dim)`.
-            torch.Tensor or None
-                The reduced lengths Tensor.
-        """
-        output = self.linear(input)
-        if lengths is None:
-            B, T, _ = input.shape
-            dummy_lengths = torch.full((B,), T)
-            output, _ = self.time_reduction(output, dummy_lengths)
-        else:
-            output, lengths = self.time_reduction(output, lengths)
-        return output, lengths
-
-
-class EmformerEncoder(torch.nn.Module):
-    """Emformer Encoder class for HuBERT pre-training. Consists of emformer module,
-    linear layer and layer normalization layer.
-
-    Args:
-        emformer (torch.nn.Module):
-            :py:class:`torchaudio.models.Emformer` module that consists of a list of emformer layers.
-        output_linear (torch.nn.Module):
-            Linear layer after emformer module.
-        layer_norm (torch.nn.Module):
-            Apply layer normalization to the output.
-    """
-
-    def __init__(
-        self,
-        emformer: torch.nn.Module,
-        output_linear: torch.nn.Module,
-        layer_norm: torch.nn.Module,
-    ):
-        super().__init__()
-        self.emformer = emformer
-        self.output_linear = output_linear
-        self.layer_norm = layer_norm
-
-    def forward(
-        self,
-        input: torch.Tensor,
-        lengths: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        """
-        Args:
-            input (torch.Tensor): The input feature for emformer encoder.
-                Tensor with dimensions `(batch, time, feature_dim)`.
-            lengths (torch.Tensor or None): Valid length of each input sample.
-                Tensor with dimension `(batch, )`.
-
-        Returns:
-            torch.Tensor: The feature Tensor after emformer encoder.
-        """
-        if lengths is None:
-            B, T, _ = input.shape
-            dummy_lengths = torch.full((B,), T)
-            output, _ = self.emformer(input, dummy_lengths)
-        else:
-            output, lengths = self.emformer(input, lengths)
-        output = self.output_linear(output)
-        output = self.layer_norm(output)
-        return output
-
-    def extract_features(
-        self,
-        input: torch.Tensor,
-        lengths: Optional[torch.Tensor],
-        num_layers: Optional[int] = None,
-    ) -> List[torch.Tensor]:
-        """Extract output Tensors of the emformer layers.
-
-        Args:
-            input (torch.Tensor): The input feature for emformer encoder.
-                Tensor with dimensions `(batch, time, feature_dim)`.
-            lengths (torch.Tensor or None): Valid length of each input sample.
-                Tensor with dimension `(batch, )`.
-            num_layers (int or None, optional): If not ``None``, returns the first
-                `num_layers` layers of Tensors as the output, otherwise returns the
-                Tensors from all emformer layers.
-
-        Returns:
-            List[torch.Tensor]:
-                Output Tensors of selected emformer layers.
-        """
-        if num_layers is not None:
-            if not 0 < num_layers <= len(self.emformer.emformer_layers):
-                raise ValueError(f"`num_layers` must be between [1, {len(self.emformer.emformer_layers)}]")
-
-        ret: List[torch.Tensor] = []
-
-        input = input.permute(1, 0, 2)
-        right_context = self.emformer._gen_right_context(input)
-        utterance = input[: input.size(0) - self.emformer.right_context_length]
-        attention_mask = self.emformer._gen_attention_mask(utterance)
-        mems = (
-            self.emformer.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1]
-            if self.emformer.use_mem
-            else torch.empty(0).to(dtype=input.dtype, device=input.device)
-        )
-        output = utterance
-        if lengths is None:
-            B, T, _ = input.shape
-            lengths = torch.full((B,), T)
-        for layer in self.emformer.emformer_layers:
-            output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask)
-            ret.append(output.permute(1, 0, 2))
-            if num_layers is not None and len(ret) >= num_layers:
-                return ret
-        return ret
-
-
-def _get_emformer_feature_extractor(input_dim: int, output_dim: int, use_bias: bool, stride: int) -> FeatureEncoder:
-    """Construct FeatureEncoder for emformer model.
-
-    Args:
-        input_dim (int): The feature dimension of log-mel spectrogram feature.
-        output_dim (int): The feature dimension after linear layer.
-        use_bias (bool): If ``True``, enable bias parameter in the linear layer.
-        stride (int): Number of frames to merge for the output frame.
-
-    Returns:
-        FeatureEncoder: The resulting FeatureEncoder module.
-    """
-    return FeatureEncoder(input_dim, output_dim, use_bias, stride)
-
-
-def _get_emformer_encoder(
-    input_dim: int,
-    output_dim: int,
-    num_heads: int,
-    ffn_dim: int,
-    num_layers: int,
-    segment_length: int,
-    left_context_length: int,
-    right_context_length: int,
-    dropout: float,
-    activation: str,
-    max_memory_size: int,
-    weight_init_scale_strategy: Optional[str],
-    tanh_on_mem: bool,
-) -> EmformerEncoder:
-    """Construct EmformerEncoder for emformer model.
-
-    Args:
-        input_dim (int): The feature dimension of input Tensor.
-        output_dim (int): The feature dimension after EmformerEncoder.
-        num_heads (int): Number of attention heads in each Emformer layer.
-        ffn_dim: (int): Hidden layer dimension of feedforward network.
-        num_layers (int): Number of Emformer layers to instantiate.
-        segment_length (int): Length of each input segment.
-        left_context_length (int): Length of left context.
-        right_context_length (int): Length of right context.
-        dropout (float): Dropout probability.
-        activation (str): Activation function to use in each Emformer layer's
-            feedforward network. Must be one of ("relu", "gelu", "silu").
-        max_memory_size (int): Maximum number of memory elements to use.
-        weight_init_scale_strategy (str or None): Per-layer weight initialization scaling
-            strategy. Must be one of ("depthwise", "constant", ``None``).
-        tanh_on_mem (bool): If ``True``, applies tanh to memory elements.
-
-    Returns:
-        EmformerEncoder: The resulting EmformerEncoder module.
-    """
-    emformer = Emformer(
-        input_dim=input_dim,
-        num_heads=num_heads,
-        ffn_dim=ffn_dim,
-        num_layers=num_layers,
-        segment_length=segment_length,
-        left_context_length=left_context_length,
-        right_context_length=right_context_length,
-        dropout=dropout,
-        activation=activation,
-        max_memory_size=max_memory_size,
-        weight_init_scale_strategy=weight_init_scale_strategy,
-        tanh_on_mem=tanh_on_mem,
-    )
-    output_linear = torch.nn.Linear(input_dim, output_dim)
-    layer_norm = torch.nn.LayerNorm(output_dim)
-    return EmformerEncoder(emformer, output_linear, layer_norm)
-
-
-@dropping_support
-def emformer_hubert_model(
-    extractor_input_dim: int,
-    extractor_output_dim: int,
-    extractor_use_bias: bool,
-    extractor_stride: int,
-    encoder_input_dim: int,
-    encoder_output_dim: int,
-    encoder_num_heads: int,
-    encoder_ffn_dim: int,
-    encoder_num_layers: int,
-    encoder_segment_length: int,
-    encoder_left_context_length: int,
-    encoder_right_context_length: int,
-    encoder_dropout: float,
-    encoder_activation: str,
-    encoder_max_memory_size: int,
-    encoder_weight_init_scale_strategy: Optional[str],
-    encoder_tanh_on_mem: bool,
-    aux_num_out: Optional[int],
-) -> Wav2Vec2Model:
-    """Build a custom Emformer HuBERT model.
-
-    Args:
-        extractor_input_dim (int): The input dimension for feature extractor.
-        extractor_output_dim (int): The output dimension after feature extractor.
-        extractor_use_bias (bool): If ``True``, enable bias parameter in the linear layer of feature extractor.
-        extractor_stride (int): Number of frames to merge for the output frame in feature extractor.
-        encoder_input_dim (int): The input dimension for Emformer layer.
-        encoder_output_dim (int): The output dimension after EmformerEncoder.
-        encoder_num_heads (int): Number of attention heads in each Emformer layer.
-        encoder_ffn_dim (int): Hidden layer dimension of feedforward network in Emformer.
-        encoder_num_layers (int): Number of Emformer layers to instantiate.
-        encoder_segment_length (int): Length of each input segment.
-        encoder_left_context_length (int): Length of left context.
-        encoder_right_context_length (int): Length of right context.
-        encoder_dropout (float): Dropout probability.
-        encoder_activation (str): Activation function to use in each Emformer layer's
-            feedforward network. Must be one of ("relu", "gelu", "silu").
-        encoder_max_memory_size (int): Maximum number of memory elements to use.
-        encoder_weight_init_scale_strategy (str or None): Per-layer weight initialization scaling
-            strategy. Must be one of ("depthwise", "constant", ``None``).
-        encoder_tanh_on_mem (bool): If ``True``, applies tanh to memory elements.
-        aux_num_out (int or None):
-            When provided, attach an extra linear layer on top of encoder, which can be
-            used for fine-tuning.
-
-    Returns:
-        Wav2Vec2Model:
-            The resulting :py:class:`torchaudio.models.Wav2Vec2Model` model
-            with a :py:class:`torchaudio.models.Emformer` encoder.
-    """
-    feature_extractor = _get_emformer_feature_extractor(
-        extractor_input_dim, extractor_output_dim, extractor_use_bias, extractor_stride
-    )
-    emformer = _get_emformer_encoder(
-        encoder_input_dim,
-        encoder_output_dim,
-        encoder_num_heads,
-        encoder_ffn_dim,
-        encoder_num_layers,
-        encoder_segment_length,
-        encoder_left_context_length,
-        encoder_right_context_length,
-        encoder_dropout,
-        encoder_activation,
-        encoder_max_memory_size,
-        encoder_weight_init_scale_strategy,
-        encoder_tanh_on_mem,
-    )
-    aux = None
-    if aux_num_out is not None:
-        aux = torch.nn.Linear(in_features=encoder_output_dim, out_features=aux_num_out)
-    return Wav2Vec2Model(feature_extractor, emformer, aux)
-
-
-@dropping_support
-def emformer_hubert_base(
-    extractor_input_dim: int = 80,
-    extractor_output_dim: int = 128,
-    encoder_dropout: float = 0.1,
-    aux_num_out: Optional[int] = None,
-) -> Wav2Vec2Model:
-    """Build Emformer HuBERT Model with 20 Emformer layers.
-
-    Args:
-        extractor_input_dim (int, optional): The input dimension for feature extractor. (Default: 80)
-        extractor_output_dim (int, optional): The output dimension after feature extractor. (Default: 128)
-        encoder_dropout (float, optional): Dropout probability in Emformer. (Default: 0.1)
-        aux_num_out (int or None, optional): Output dimension of aux layer for fine-tuning. (Default: ``None``)
-
-    Returns:
-        Wav2Vec2Model:
-            The resulting :py:class:`torchaudio.models.Wav2Vec2Model` model
-            with a :py:class:`torchaudio.models.Emformer` encoder.
-    """
-    return emformer_hubert_model(
-        extractor_input_dim=extractor_input_dim,
-        extractor_output_dim=extractor_output_dim,
-        extractor_use_bias=False,
-        extractor_stride=4,
-        encoder_input_dim=512,
-        encoder_output_dim=1024,
-        encoder_num_heads=8,
-        encoder_ffn_dim=2048,
-        encoder_num_layers=20,
-        encoder_segment_length=4,
-        encoder_left_context_length=30,
-        encoder_right_context_length=1,
-        encoder_dropout=encoder_dropout,
-        encoder_activation="gelu",
-        encoder_max_memory_size=0,
-        encoder_weight_init_scale_strategy="depthwise",
-        encoder_tanh_on_mem=True,
-        aux_num_out=aux_num_out,
-    )
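For reference, the factory functions deleted above were still importable from torchaudio.prototype.models on torchaudio 2.8.0 (already flagged with @dropping_support). A usage sketch based only on the signatures and defaults shown in the removed code, valid on 2.8.0 and earlier; the shapes follow the docstrings (80-dim log-mel frames, stride-4 time reduction into a 512-dim, 20-layer Emformer with 1024-dim output):

# Usage sketch for the removed prototype API (torchaudio <= 2.8.0 only).
import torch
from torchaudio.prototype.models import emformer_hubert_base

model = emformer_hubert_base(aux_num_out=None)

features = torch.randn(2, 400, 80)   # (batch, time, 80) log-mel frames
lengths = torch.full((2,), 400)      # valid frames per batch element

output, out_lengths = model(features, lengths)          # Wav2Vec2Model forward
print(output.shape)                  # (batch, reduced_time, 1024); time shrinks via the stride-4 reduction
hidden, _ = model.extract_features(features, lengths)   # per-layer Emformer outputs
print(len(hidden))                   # 20 layers by default

Since the module no longer ships in 2.9.0, code that depends on it has to pin torchaudio<=2.8.0 or vendor the removed file.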