PyPI - torchaudio - Versions diffs - 2.9.0__cp314-cp314-macosx_11_0_arm64.whl - Mend

torchaudio 2.9.0__cp314-cp314-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchaudio might be problematic. Click here for more details.

Files changed (86) hide show

torchaudio/.dylibs/libc++.1.0.dylib +0 -0
torchaudio/__init__.py +204 -0
torchaudio/_extension/__init__.py +61 -0
torchaudio/_extension/utils.py +133 -0
torchaudio/_internal/__init__.py +10 -0
torchaudio/_internal/module_utils.py +171 -0
torchaudio/_torchcodec.py +340 -0
torchaudio/compliance/__init__.py +5 -0
torchaudio/compliance/kaldi.py +813 -0
torchaudio/datasets/__init__.py +47 -0
torchaudio/datasets/cmuarctic.py +157 -0
torchaudio/datasets/cmudict.py +186 -0
torchaudio/datasets/commonvoice.py +86 -0
torchaudio/datasets/dr_vctk.py +121 -0
torchaudio/datasets/fluentcommands.py +108 -0
torchaudio/datasets/gtzan.py +1118 -0
torchaudio/datasets/iemocap.py +147 -0
torchaudio/datasets/librilight_limited.py +111 -0
torchaudio/datasets/librimix.py +133 -0
torchaudio/datasets/librispeech.py +174 -0
torchaudio/datasets/librispeech_biasing.py +189 -0
torchaudio/datasets/libritts.py +168 -0
torchaudio/datasets/ljspeech.py +107 -0
torchaudio/datasets/musdb_hq.py +139 -0
torchaudio/datasets/quesst14.py +136 -0
torchaudio/datasets/snips.py +157 -0
torchaudio/datasets/speechcommands.py +183 -0
torchaudio/datasets/tedlium.py +218 -0
torchaudio/datasets/utils.py +54 -0
torchaudio/datasets/vctk.py +143 -0
torchaudio/datasets/voxceleb1.py +309 -0
torchaudio/datasets/yesno.py +89 -0
torchaudio/functional/__init__.py +130 -0
torchaudio/functional/_alignment.py +128 -0
torchaudio/functional/filtering.py +1685 -0
torchaudio/functional/functional.py +2505 -0
torchaudio/lib/__init__.py +0 -0
torchaudio/lib/_torchaudio.so +0 -0
torchaudio/lib/libtorchaudio.so +0 -0
torchaudio/models/__init__.py +85 -0
torchaudio/models/_hdemucs.py +1008 -0
torchaudio/models/conformer.py +293 -0
torchaudio/models/conv_tasnet.py +330 -0
torchaudio/models/decoder/__init__.py +64 -0
torchaudio/models/decoder/_ctc_decoder.py +568 -0
torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
torchaudio/models/deepspeech.py +84 -0
torchaudio/models/emformer.py +884 -0
torchaudio/models/rnnt.py +816 -0
torchaudio/models/rnnt_decoder.py +339 -0
torchaudio/models/squim/__init__.py +11 -0
torchaudio/models/squim/objective.py +326 -0
torchaudio/models/squim/subjective.py +150 -0
torchaudio/models/tacotron2.py +1046 -0
torchaudio/models/wav2letter.py +72 -0
torchaudio/models/wav2vec2/__init__.py +45 -0
torchaudio/models/wav2vec2/components.py +1167 -0
torchaudio/models/wav2vec2/model.py +1579 -0
torchaudio/models/wav2vec2/utils/__init__.py +7 -0
torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
torchaudio/models/wavernn.py +409 -0
torchaudio/pipelines/__init__.py +102 -0
torchaudio/pipelines/_source_separation_pipeline.py +109 -0
torchaudio/pipelines/_squim_pipeline.py +156 -0
torchaudio/pipelines/_tts/__init__.py +16 -0
torchaudio/pipelines/_tts/impl.py +385 -0
torchaudio/pipelines/_tts/interface.py +255 -0
torchaudio/pipelines/_tts/utils.py +230 -0
torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
torchaudio/pipelines/_wav2vec2/utils.py +346 -0
torchaudio/pipelines/rnnt_pipeline.py +380 -0
torchaudio/transforms/__init__.py +78 -0
torchaudio/transforms/_multi_channel.py +467 -0
torchaudio/transforms/_transforms.py +2138 -0
torchaudio/utils/__init__.py +4 -0
torchaudio/utils/download.py +89 -0
torchaudio/version.py +2 -0
torchaudio-2.9.0.dist-info/LICENSE +25 -0
torchaudio-2.9.0.dist-info/METADATA +122 -0
torchaudio-2.9.0.dist-info/RECORD +86 -0
torchaudio-2.9.0.dist-info/WHEEL +5 -0
torchaudio-2.9.0.dist-info/top_level.txt +1 -0

torchaudio/pipelines/_wav2vec2/utils.py ADDED Viewed

@@ -0,0 +1,346 @@
+from typing import List, Optional, Tuple
+import torch
+from torch import nn, Tensor
+from torchaudio._internal import load_state_dict_from_url
+from torchaudio.models import wav2vec2_model, Wav2Vec2Model, wavlm_model
+def _get_model(type_, params):
+    factories = {
+        "Wav2Vec2": wav2vec2_model,
+        "WavLM": wavlm_model,
+    }
+    if type_ not in factories:
+        raise ValueError(f"Supported model types are {tuple(factories.keys())}. Found: {type_}")
+    factory = factories[type_]
+    return factory(**params)
+class _Wav2Vec2Model(nn.Module):
+    """Wrapper class for :py:class:`~torchaudio.models.Wav2Vec2Model`.
+    This is used for layer normalization at the input
+    """
+    def __init__(self, model: Wav2Vec2Model, normalize_waveform: bool, apply_log_softmax: bool, append_star: bool):
+        super().__init__()
+        self.model = model
+        self.normalize_waveform = normalize_waveform
+        self.apply_log_softmax = apply_log_softmax
+        self.append_star = append_star
+    def forward(self, waveforms: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
+        if self.normalize_waveform:
+            waveforms = nn.functional.layer_norm(waveforms, waveforms.shape)
+        output, output_lengths = self.model(waveforms, lengths)
+        if self.apply_log_softmax:
+            output = torch.nn.functional.log_softmax(output, dim=-1)
+        if self.append_star:
+            star_dim = torch.zeros((1, output.size(1), 1), dtype=output.dtype, device=output.device)
+            output = torch.cat((output, star_dim), dim=-1)
+        return output, output_lengths
+    @torch.jit.export
+    def extract_features(
+        self,
+        waveforms: Tensor,
+        lengths: Optional[Tensor] = None,
+        num_layers: Optional[int] = None,
+    ) -> Tuple[List[Tensor], Optional[Tensor]]:
+        if self.normalize_waveform:
+            waveforms = nn.functional.layer_norm(waveforms, waveforms.shape)
+        return self.model.extract_features(waveforms, lengths, num_layers)
+def _extend_model(module, normalize_waveform, apply_log_softmax=False, append_star=False):
+    """Add extra transformations to the model"""
+    return _Wav2Vec2Model(module, normalize_waveform, apply_log_softmax, append_star)
+def _remove_aux_axes(state_dict, axes):
+    # Remove the seemingly unnecessary axis
+    # For ASR task, the pretrained weights originated from fairseq has unrelated dimensions at index 1, 2, 3
+    # It's originated from the Dictionary implementation of fairseq, which was intended for NLP tasks,
+    # but not used during the ASR training.
+    # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/data/dictionary.py#L21-L37
+    # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/criterions/ctc.py#L126-L129
+    #
+    # Also, some pretrained weights originated from voxpopuli has an extra dimensions that almost never used and
+    # that resembles mistake.
+    # The label `1` shows up in the training dataset of German (1 out of 16M),
+    # English (1 / 28M), Spanish (1 / 9.4M), Romanian (1 / 4.7M) and Polish (6 / 5.8M)
+    for key in ["aux.weight", "aux.bias"]:
+        mat = state_dict[key]
+        state_dict[key] = torch.stack([mat[i] for i in range(mat.size(0)) if i not in axes])
+def _get_state_dict(url, dl_kwargs, remove_axes=None):
+    if not url.startswith("https"):
+        url = f"https://download.pytorch.org/torchaudio/models/{url}"
+    dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+    state_dict = load_state_dict_from_url(url, **dl_kwargs)
+    if remove_axes:
+        _remove_aux_axes(state_dict, remove_axes)
+    return state_dict
+def _get_en_labels():
+    return (
+        "|",
+        "E",
+        "T",
+        "A",
+        "O",
+        "N",
+        "I",
+        "H",
+        "S",
+        "R",
+        "D",
+        "L",
+        "U",
+        "M",
+        "W",
+        "C",
+        "F",
+        "G",
+        "Y",
+        "P",
+        "B",
+        "V",
+        "K",
+        "'",
+        "X",
+        "J",
+        "Q",
+        "Z",
+    )
+def _get_de_labels():
+    return (
+        "|",
+        "e",
+        "n",
+        "i",
+        "r",
+        "s",
+        "t",
+        "a",
+        "d",
+        "h",
+        "u",
+        "l",
+        "g",
+        "c",
+        "m",
+        "o",
+        "b",
+        "w",
+        "f",
+        "k",
+        "z",
+        "p",
+        "v",
+        "ü",
+        "ä",
+        "ö",
+        "j",
+        "ß",
+        "y",
+        "x",
+        "q",
+    )
+def _get_vp_en_labels():
+    return (
+        "|",
+        "e",
+        "t",
+        "o",
+        "i",
+        "a",
+        "n",
+        "s",
+        "r",
+        "h",
+        "l",
+        "d",
+        "c",
+        "u",
+        "m",
+        "p",
+        "f",
+        "g",
+        "w",
+        "y",
+        "b",
+        "v",
+        "k",
+        "x",
+        "j",
+        "q",
+        "z",
+    )
+def _get_es_labels():
+    return (
+        "|",
+        "e",
+        "a",
+        "o",
+        "s",
+        "n",
+        "r",
+        "i",
+        "l",
+        "d",
+        "c",
+        "t",
+        "u",
+        "p",
+        "m",
+        "b",
+        "q",
+        "y",
+        "g",
+        "v",
+        "h",
+        "ó",
+        "f",
+        "í",
+        "á",
+        "j",
+        "z",
+        "ñ",
+        "é",
+        "x",
+        "ú",
+        "k",
+        "w",
+        "ü",
+    )
+def _get_fr_labels():
+    return (
+        "|",
+        "e",
+        "s",
+        "n",
+        "i",
+        "t",
+        "r",
+        "a",
+        "o",
+        "u",
+        "l",
+        "d",
+        "c",
+        "p",
+        "m",
+        "é",
+        "v",
+        "q",
+        "f",
+        "g",
+        "b",
+        "h",
+        "x",
+        "à",
+        "j",
+        "è",
+        "y",
+        "ê",
+        "z",
+        "ô",
+        "k",
+        "ç",
+        "œ",
+        "û",
+        "ù",
+        "î",
+        "â",
+        "w",
+        "ï",
+        "ë",
+        "ü",
+        "æ",
+    )
+def _get_it_labels():
+    return (
+        "|",
+        "e",
+        "i",
+        "a",
+        "o",
+        "n",
+        "t",
+        "r",
+        "l",
+        "s",
+        "c",
+        "d",
+        "u",
+        "p",
+        "m",
+        "g",
+        "v",
+        "h",
+        "z",
+        "f",
+        "b",
+        "q",
+        "à",
+        "è",
+        "ù",
+        "é",
+        "ò",
+        "ì",
+        "k",
+        "y",
+        "x",
+        "w",
+        "j",
+        "ó",
+        "í",
+        "ï",
+    )
+def _get_mms_labels():
+    return (
+        "a",
+        "i",
+        "e",
+        "n",
+        "o",
+        "u",
+        "t",
+        "s",
+        "r",
+        "m",
+        "k",
+        "l",
+        "d",
+        "g",
+        "h",
+        "y",
+        "b",
+        "p",
+        "w",
+        "c",
+        "v",
+        "j",
+        "z",
+        "f",
+        "'",
+        "q",
+        "x",
+    )