torchaudio 2.0.2-cp38-cp38-manylinux1_x86_64.whl → 2.1.1-cp38-cp38-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchaudio might be problematic.
- torchaudio/__init__.py +22 -3
- torchaudio/_backend/__init__.py +55 -4
- torchaudio/_backend/backend.py +53 -0
- torchaudio/_backend/common.py +52 -0
- torchaudio/_backend/ffmpeg.py +373 -0
- torchaudio/_backend/soundfile.py +54 -0
- torchaudio/_backend/soundfile_backend.py +457 -0
- torchaudio/_backend/sox.py +91 -0
- torchaudio/_backend/utils.py +81 -323
- torchaudio/_extension/__init__.py +55 -36
- torchaudio/_extension/utils.py +109 -17
- torchaudio/_internal/__init__.py +4 -1
- torchaudio/_internal/module_utils.py +37 -6
- torchaudio/backend/__init__.py +7 -11
- torchaudio/backend/_no_backend.py +24 -0
- torchaudio/backend/_sox_io_backend.py +297 -0
- torchaudio/backend/common.py +12 -52
- torchaudio/backend/no_backend.py +11 -21
- torchaudio/backend/soundfile_backend.py +11 -448
- torchaudio/backend/sox_io_backend.py +11 -435
- torchaudio/backend/utils.py +9 -18
- torchaudio/datasets/__init__.py +2 -0
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/cmudict.py +61 -62
- torchaudio/datasets/dr_vctk.py +1 -1
- torchaudio/datasets/gtzan.py +1 -1
- torchaudio/datasets/librilight_limited.py +1 -1
- torchaudio/datasets/librispeech.py +1 -1
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +1 -1
- torchaudio/datasets/ljspeech.py +1 -1
- torchaudio/datasets/musdb_hq.py +1 -1
- torchaudio/datasets/quesst14.py +1 -1
- torchaudio/datasets/speechcommands.py +1 -1
- torchaudio/datasets/tedlium.py +1 -1
- torchaudio/datasets/vctk.py +1 -1
- torchaudio/datasets/voxceleb1.py +1 -1
- torchaudio/datasets/yesno.py +1 -1
- torchaudio/functional/__init__.py +6 -2
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +69 -92
- torchaudio/functional/functional.py +99 -148
- torchaudio/io/__init__.py +4 -1
- torchaudio/io/_effector.py +347 -0
- torchaudio/io/_stream_reader.py +158 -90
- torchaudio/io/_stream_writer.py +196 -10
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/_torchaudio_sox.so +0 -0
- torchaudio/lib/libctc_prefix_decoder.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/libtorchaudio_sox.so +0 -0
- torchaudio/lib/pybind11_prefixctc.so +0 -0
- torchaudio/models/__init__.py +14 -0
- torchaudio/models/decoder/__init__.py +22 -7
- torchaudio/models/decoder/_ctc_decoder.py +123 -69
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/rnnt_decoder.py +10 -14
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/wav2vec2/components.py +6 -10
- torchaudio/pipelines/__init__.py +9 -0
- torchaudio/pipelines/_squim_pipeline.py +176 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +198 -68
- torchaudio/pipelines/_wav2vec2/utils.py +120 -0
- torchaudio/sox_effects/sox_effects.py +7 -30
- torchaudio/transforms/__init__.py +2 -0
- torchaudio/transforms/_transforms.py +99 -54
- torchaudio/utils/download.py +2 -2
- torchaudio/utils/ffmpeg_utils.py +20 -15
- torchaudio/utils/sox_utils.py +8 -9
- torchaudio/version.py +2 -2
- torchaudio-2.1.1.dist-info/METADATA +113 -0
- torchaudio-2.1.1.dist-info/RECORD +119 -0
- torchaudio/io/_compat.py +0 -241
- torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
- torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
- torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
- torchaudio/lib/libflashlight-text.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
- torchaudio-2.0.2.dist-info/METADATA +0 -26
- torchaudio-2.0.2.dist-info/RECORD +0 -100
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
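Most of the I/O surface moves in this release: a new torchaudio/_backend/ package (dispatcher plus ffmpeg/sox/soundfile implementations) replaces much of the old torchaudio/backend/ module set. A minimal sketch of what per-call backend selection looks like for downstream code is below; the `backend` keyword and its accepted values ("ffmpeg", "sox", "soundfile") are inferred from the new _backend modules listed above, not spelled out in this diff, so treat them as assumptions.

    import torchaudio

    # Sketch of the 2.1 per-call dispatcher (assumed keyword): pick a backend
    # explicitly instead of relying on the old global torchaudio.set_audio_backend().
    waveform, sample_rate = torchaudio.load("speech.wav", backend="soundfile")
    torchaudio.save("speech_copy.wav", waveform, sample_rate, backend="soundfile")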
--- a/torchaudio/pipelines/_wav2vec2/impl.py
+++ b/torchaudio/pipelines/_wav2vec2/impl.py
@@ -1,41 +1,12 @@
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 
-import torch
-from torch import Tensor
-from torch.nn import functional as F, Module
-from torchaudio._internal import load_state_dict_from_url
-from torchaudio.models import wav2vec2_model, Wav2Vec2Model, wavlm_model
+from torch.nn import Module
 
-from . import utils
+from . import aligner, utils
 
 
-__all__ = []
-
-
-class _Wav2Vec2Model(Module):
-    """Wrapper class for :py:class:`~torchaudio.models.Wav2Vec2Model`.
-
-    This is used for layer normalization at the input
-    """
-
-    def __init__(self, model: Wav2Vec2Model):
-        super().__init__()
-        self.model = model
-
-    def forward(self, waveforms: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
-        waveforms = F.layer_norm(waveforms, waveforms.shape)
-        return self.model(waveforms, lengths)
-
-    @torch.jit.export
-    def extract_features(
-        self,
-        waveforms: Tensor,
-        lengths: Optional[Tensor] = None,
-        num_layers: Optional[int] = None,
-    ) -> Tuple[List[Tensor], Optional[Tensor]]:
-        waveforms = F.layer_norm(waveforms, waveforms.shape)
-        return self.model.extract_features(waveforms, lengths, num_layers)
+__all__ = []  # type: ignore
 
 
 @dataclass
@@ -84,10 +55,8 @@ class Wav2Vec2Bundle:
         return self._sample_rate
 
     def _get_state_dict(self, dl_kwargs):
-        url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
-        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
-        state_dict = load_state_dict_from_url(url, **dl_kwargs)
-        return state_dict
+        # Note: This method is overridden in ASR bundle
+        return utils._get_state_dict(self._path, dl_kwargs)
 
     def get_model(self, *, dl_kwargs=None) -> Module:
         """Construct the model and load the pretrained weight.
@@ -119,13 +88,11 @@ class Wav2Vec2Bundle:
             - HUBERT_ASR_XLARGE
             - WAVLM_LARGE
         """
-
-
-
-        model = wav2vec2_model(**self._params)
-        model.load_state_dict(self._get_state_dict(dl_kwargs))
+        model = utils._get_model(self._model_type, self._params)
+        state_dict = self._get_state_dict(dl_kwargs)
+        model.load_state_dict(state_dict)
         if self._normalize_waveform:
-            model = _Wav2Vec2Model(model)
+            model = utils._extend_model(model, normalize_waveform=True)
         model.eval()
         return model
 
@@ -171,15 +138,15 @@ class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
         >>> transcripts = ctc_decode(emissions, labels)
     """  # noqa: E501
 
-    _labels: Tuple[str]
-    _remove_aux_axis: Tuple[int] = (1, 2, 3)
+    _labels: Tuple[str, ...]
+    _remove_aux_axis: Tuple[int, ...] = (1, 2, 3)
 
     def get_labels(
         self,
         *,
         blank: str = "-",
-    ) -> Tuple[str]:
-        """The output class labels
+    ) -> Tuple[str, ...]:
+        """The output class labels.
 
         The first is blank token, and it is customizable.
 
@@ -187,35 +154,19 @@ class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
             blank (str, optional): Blank token. (default: ``'-'``)
 
         Returns:
-            Tuple[str]:
+            Tuple[str, ...]:
             For models fine-tuned on ASR, returns the tuple of strings representing
             the output class labels.
 
         Example
-            >>> import torchaudio
-            >>> torchaudio.pipelines.HUBERT_ASR_LARGE.get_labels()
+            >>> from torchaudio.pipelines import HUBERT_ASR_LARGE as bundle
+            >>> bundle.get_labels()
             ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
         """  # noqa: E501
         return (blank, *self._labels)
 
     def _get_state_dict(self, dl_kwargs):
-        state_dict = super()._get_state_dict(dl_kwargs)
-        if self._remove_aux_axis:
-            # Remove the seemingly unnecessary axis
-            # For ASR task, the pretrained weights originated from fairseq has unrelated dimensions at index 1, 2, 3
-            # It's originated from the Dictionary implementation of fairseq, which was intended for NLP tasks,
-            # but not used during the ASR training.
-            # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/data/dictionary.py#L21-L37
-            # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/criterions/ctc.py#L126-L129
-            #
-            # Also, some pretrained weights originated from voxpopuli has an extra dimensions that almost never used and
-            # that resembles mistake.
-            # The label `1` shows up in the training dataset of German (1 out of 16M),
-            # English (1 / 28M), Spanish (1 / 9.4M), Romanian (1 / 4.7M) and Polish (6 / 5.8M)
-            for key in ["aux.weight", "aux.bias"]:
-                t = state_dict[key]
-                state_dict[key] = torch.stack([t[i] for i in range(t.size(0)) if i not in self._remove_aux_axis])
-        return state_dict
+        return utils._get_state_dict(self._path, dl_kwargs, self._remove_aux_axis)
 
 
 WAV2VEC2_BASE = Wav2Vec2Bundle(
@@ -1399,7 +1350,7 @@ WAVLM_LARGE = Wav2Vec2Bundle(
         "encoder_ff_interm_features": 4096,
         "encoder_ff_interm_dropout": 0.0,
         "encoder_dropout": 0.1,
-        "encoder_layer_norm_first": False,
+        "encoder_layer_norm_first": True,
         "encoder_layer_drop": 0.05,
         "aux_num_out": None,
     },
@@ -1567,3 +1518,182 @@ redistributed with the same license.
 
 Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
 """  # noqa: E501
+
+
+@dataclass
+class Wav2Vec2FABundle(Wav2Vec2ASRBundle):
+    """Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model` for forced alignment.
+
+    This class provides interfaces for instantiating the pretrained model along with
+    the information necessary to retrieve pretrained weights and additional data
+    to be used with the model.
+
+    Torchaudio library instantiates objects of this class, each of which represents
+    a different pretrained model. Client code should access pretrained models via these
+    instances.
+
+    Please see below for the usage and the available values.
+
+    Example - Feature Extraction
+        >>> import torchaudio
+        >>>
+        >>> bundle = torchaudio.pipelines.MMS_FA
+        >>>
+        >>> # Build the model and load pretrained weight.
+        >>> model = bundle.get_model()
+        Downloading:
+        100%|███████████████████████████████| 1.18G/1.18G [00:05<00:00, 216MB/s]
+        >>>
+        >>> # Resample audio to the expected sampling rate
+        >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
+        >>>
+        >>> # Estimate the probability of token distribution
+        >>> emission, _ = model(waveform)
+        >>>
+        >>> # Generate frame-wise alignment
+        >>> alignment, scores = torchaudio.functional.forced_align(
+        >>>     emission, targets, input_lengths, target_lengths, blank=0)
+        >>>
+    """  # noqa: E501
+
+    class Tokenizer(aligner.ITokenizer):
+        """Interface of the tokenizer"""
+
+    class Aligner(aligner.IAligner):
+        """Interface of the aligner"""
+
+    def get_labels(self, star: Optional[str] = "*", blank: str = "-") -> Tuple[str, ...]:
+        """Get the labels corresponding to the feature dimension of emission.
+
+        The first is blank token, and it is customizable.
+
+        Args:
+            star (str or None, optional): Change or disable star token. (default: ``"*"``)
+            blank (str, optional): Change the blank token. (default: ``'-'``)
+
+        Returns:
+            Tuple[str, ...]:
+            For models fine-tuned on ASR, returns the tuple of strings representing
+            the output class labels.
+
+        Example
+            >>> from torchaudio.pipelines import MMS_FA as bundle
+            >>> bundle.get_labels()
+            ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x', '*')
+            >>> bundle.get_labels(star=None)
+            ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x')
+        """  # noqa: E501
+        labels = super().get_labels(blank=blank)
+        return labels if star is None else (*labels, star)
+
+    def get_model(self, with_star: bool = True, *, dl_kwargs=None) -> Module:
+        """Construct the model and load the pretrained weight.
+
+        The weight file is downloaded from the internet and cached with
+        :func:`torch.hub.load_state_dict_from_url`
+
+        Args:
+            with_star (bool, optional): If enabled, the last dimension of output layer is
+                extended by one, which corresponds to `star` token.
+            dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
+
+        Returns:
+            Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`.
+
+            .. note::
+
+               The model created with this method returns probability in log-domain,
+               (i.e. :py:func:`torch.nn.functional.log_softmax` is applied), whereas
+               the other Wav2Vec2 models returns logit.
+        """
+        model = utils._get_model(self._model_type, self._params)
+        state_dict = utils._get_state_dict(self._path, dl_kwargs, self._remove_aux_axis)
+        model.load_state_dict(state_dict)
+        model = utils._extend_model(
+            model, normalize_waveform=self._normalize_waveform, apply_log_softmax=True, append_star=with_star
+        )
+        model.eval()
+        return model
+
+    def get_dict(self, star: Optional[str] = "*", blank: str = "-") -> Dict[str, int]:
+        """Get the mapping from token to index (in emission feature dim)
+
+        Args:
+            star (str or None, optional): Change or disable star token. (default: ``"*"``)
+            blank (str, optional): Change the blank token. (default: ``'-'``)
+
+        Returns:
+            Tuple[str, ...]:
+            For models fine-tuned on ASR, returns the tuple of strings representing
+            the output class labels.
+
+        Example
+            >>> from torchaudio.pipelines import MMS_FA as bundle
+            >>> bundle.get_dict()
+            {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27, '*': 28}
+            >>> bundle.get_dict(star=None)
+            {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27}
+        """  # noqa: E501
+        return {k: i for i, k in enumerate(self.get_labels(star=star, blank=blank))}
+
+    def get_tokenizer(self) -> Tokenizer:
+        """Instantiate a Tokenizer.
+
+        Returns:
+            Tokenizer
+        """
+        return aligner.Tokenizer(self.get_dict())
+
+    def get_aligner(self) -> Aligner:
+        """Instantiate an Aligner.
+
+        Returns:
+            Aligner
+        """
+        return aligner.Aligner(blank=0)
+
+
+MMS_FA = Wav2Vec2FABundle(
+    "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt",
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.0,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.1,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.1,
+        "aux_num_out": 28,
+    },
+    _labels=utils._get_mms_labels(),
+    _sample_rate=16000,
+    _normalize_waveform=True,
+    _model_type="Wav2Vec2",
+)
+MMS_FA.__doc__ = """
+Trained on 31K hours of data in 1,130 languages from *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling`.
+
+Published by the authors of *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling` under [`CC-BY-NC 4.0 License <https://github.com/facebookresearch/fairseq/tree/100cd91db19bb27277a06a25eb4154c805b10189/examples/mms#license>`__].
+
+Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2FABundle` for usage details.
+
+.. note::
+
+   Unlike other Wav2Vec2 bundles, this model does not have a token for word boundary (like `|`). This makes the post-processing of alignments slightly different.
+"""  # noqa: E501
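Taken together, the hunk above adds the whole forced-alignment bundle (Wav2Vec2FABundle) and its MMS_FA instance. A rough end-to-end sketch, assembled from the docstrings in this hunk, is below; the input file and transcript are placeholders, and the forced_align call mirrors the signature shown in the Wav2Vec2FABundle docstring.

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.MMS_FA
    model = bundle.get_model()  # emits log-probabilities with a star token appended

    waveform, sr = torchaudio.load("speech.wav")  # placeholder input
    waveform = torchaudio.functional.resample(waveform, sr, bundle.sample_rate)

    with torch.inference_mode():
        emission, _ = model(waveform)

    # Map a romanized transcript to token indices via the bundle's dictionary.
    dictionary = bundle.get_dict(star=None)
    transcript = "hello"  # placeholder transcript (MMS_FA has no word-boundary token)
    targets = torch.tensor([[dictionary[c] for c in transcript]], dtype=torch.int32)

    input_lengths = torch.tensor([emission.size(1)])
    target_lengths = torch.tensor([targets.size(1)])
    alignment, scores = torchaudio.functional.forced_align(
        emission, targets, input_lengths, target_lengths, blank=0
    )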
--- a/torchaudio/pipelines/_wav2vec2/utils.py
+++ b/torchaudio/pipelines/_wav2vec2/utils.py
@@ -1,3 +1,91 @@
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn, Tensor
+
+from torchaudio._internal import load_state_dict_from_url
+from torchaudio.models import wav2vec2_model, Wav2Vec2Model, wavlm_model
+
+
+def _get_model(type_, params):
+    factories = {
+        "Wav2Vec2": wav2vec2_model,
+        "WavLM": wavlm_model,
+    }
+    if type_ not in factories:
+        raise ValueError(f"Supported model types are {tuple(factories.keys())}. Found: {type_}")
+    factory = factories[type_]
+    return factory(**params)
+
+
+class _Wav2Vec2Model(nn.Module):
+    """Wrapper class for :py:class:`~torchaudio.models.Wav2Vec2Model`.
+
+    This is used for layer normalization at the input
+    """
+
+    def __init__(self, model: Wav2Vec2Model, normalize_waveform: bool, apply_log_softmax: bool, append_star: bool):
+        super().__init__()
+        self.model = model
+        self.normalize_waveform = normalize_waveform
+        self.apply_log_softmax = apply_log_softmax
+        self.append_star = append_star
+
+    def forward(self, waveforms: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
+        if self.normalize_waveform:
+            waveforms = nn.functional.layer_norm(waveforms, waveforms.shape)
+        output, output_lengths = self.model(waveforms, lengths)
+        if self.apply_log_softmax:
+            output = torch.nn.functional.log_softmax(output, dim=-1)
+        if self.append_star:
+            star_dim = torch.zeros((1, output.size(1), 1), dtype=output.dtype, device=output.device)
+            output = torch.cat((output, star_dim), dim=-1)
+        return output, output_lengths
+
+    @torch.jit.export
+    def extract_features(
+        self,
+        waveforms: Tensor,
+        lengths: Optional[Tensor] = None,
+        num_layers: Optional[int] = None,
+    ) -> Tuple[List[Tensor], Optional[Tensor]]:
+        if self.normalize_waveform:
+            waveforms = nn.functional.layer_norm(waveforms, waveforms.shape)
+        return self.model.extract_features(waveforms, lengths, num_layers)
+
+
+def _extend_model(module, normalize_waveform, apply_log_softmax=False, append_star=False):
+    """Add extra transformations to the model"""
+    return _Wav2Vec2Model(module, normalize_waveform, apply_log_softmax, append_star)
+
+
+def _remove_aux_axes(state_dict, axes):
+    # Remove the seemingly unnecessary axis
+    # For ASR task, the pretrained weights originated from fairseq has unrelated dimensions at index 1, 2, 3
+    # It's originated from the Dictionary implementation of fairseq, which was intended for NLP tasks,
+    # but not used during the ASR training.
+    # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/data/dictionary.py#L21-L37
+    # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/criterions/ctc.py#L126-L129
+    #
+    # Also, some pretrained weights originated from voxpopuli has an extra dimensions that almost never used and
+    # that resembles mistake.
+    # The label `1` shows up in the training dataset of German (1 out of 16M),
+    # English (1 / 28M), Spanish (1 / 9.4M), Romanian (1 / 4.7M) and Polish (6 / 5.8M)
+    for key in ["aux.weight", "aux.bias"]:
+        mat = state_dict[key]
+        state_dict[key] = torch.stack([mat[i] for i in range(mat.size(0)) if i not in axes])
+
+
+def _get_state_dict(url, dl_kwargs, remove_axes=None):
+    if not url.startswith("https"):
+        url = f"https://download.pytorch.org/torchaudio/models/{url}"
+    dl_kwargs = {} if dl_kwargs is None else dl_kwargs
+    state_dict = load_state_dict_from_url(url, **dl_kwargs)
+    if remove_axes:
+        _remove_aux_axes(state_dict, remove_axes)
+    return state_dict
+
+
 def _get_en_labels():
     return (
         "|",
@@ -224,3 +312,35 @@ def _get_it_labels():
         "í",
         "ï",
     )
+
+
+def _get_mms_labels():
+    return (
+        "a",
+        "i",
+        "e",
+        "n",
+        "o",
+        "u",
+        "t",
+        "s",
+        "r",
+        "m",
+        "k",
+        "l",
+        "d",
+        "g",
+        "h",
+        "y",
+        "b",
+        "p",
+        "w",
+        "c",
+        "v",
+        "j",
+        "z",
+        "f",
+        "'",
+        "q",
+        "x",
+    )
--- a/torchaudio/sox_effects/sox_effects.py
+++ b/torchaudio/sox_effects/sox_effects.py
@@ -1,5 +1,4 @@
 import os
-import warnings
 from typing import List, Optional, Tuple
 
 import torch
@@ -156,14 +155,6 @@ def apply_effects_tensor(
     return torch.ops.torchaudio.sox_effects_apply_effects_tensor(tensor, sample_rate, effects, channels_first)
 
 
-_deprecation_message = (
-    "File-like object support in sox_io backend is deprecated, "
-    "and will be removed in v2.1. "
-    "See https://github.com/pytorch/audio/issues/2950 for the detail."
-    "Please migrate to the new dispatcher, or use soundfile backend."
-)
-
-
 @torchaudio._extension.fail_if_no_sox
 def apply_effects_file(
     path: str,
@@ -187,18 +178,8 @@ def apply_effects_file(
             rate and leave samples untouched.
 
     Args:
-        path (path-like object or file-like object):
-            Source of audio data. When the function is not compiled by TorchScript,
-            (e.g. ``torch.jit.script``), the following types are accepted:
-
-                * ``path-like``: file path
-                * ``file-like``: Object with ``read(size: int) -> bytes`` method,
-                  which returns byte string of at most ``size`` length.
-
-            When the function is compiled by TorchScript, only ``str`` type is allowed.
-
-            Note: This argument is intentionally annotated as ``str`` only for
-            TorchScript compiler compatibility.
+        path (path-like object):
+            Source of audio data.
         effects (List[List[str]]): List of effects.
         normalize (bool, optional):
             When ``True``, this function converts the native sample type to ``float32``.
@@ -283,13 +264,9 @@ def apply_effects_file(
     """
    if not torch.jit.is_scripting():
         if hasattr(path, "read"):
-
-
-
-
-            return ret
+            raise RuntimeError(
+                "apply_effects_file function does not support file-like object. "
+                "Please use torchaudio.io.AudioEffector."
+            )
         path = os.fspath(path)
-    ret = torch.ops.torchaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)
-    if ret is not None:
-        return ret
-    raise RuntimeError("Failed to load audio from {}".format(path))
+    return torch.ops.torchaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)
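With this change, passing a file-like object to apply_effects_file raises immediately and the error message points at torchaudio.io.AudioEffector (new in this release, see torchaudio/io/_effector.py above). A hedged migration sketch follows; the constructor argument (an ffmpeg filter description) and the (frames, channels) layout expected by apply() are assumptions about the new AudioEffector API rather than something stated in this diff.

    import torchaudio
    from torchaudio.io import AudioEffector

    waveform, sample_rate = torchaudio.load("speech.wav")  # placeholder file, (channels, frames)

    # Assumed API: effects are given as an ffmpeg filter string, not a sox effect list.
    effector = AudioEffector(effect="lowpass=frequency=300")
    processed = effector.apply(waveform.T, sample_rate).T  # transpose to/from (frames, channels)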
--- a/torchaudio/transforms/__init__.py
+++ b/torchaudio/transforms/__init__.py
@@ -23,6 +23,7 @@ from ._transforms import (
     Resample,
     RNNTLoss,
     SlidingWindowCmn,
+    SpecAugment,
     SpectralCentroid,
     Spectrogram,
     Speed,
@@ -62,6 +63,7 @@ __all__ = [
     "Resample",
     "SlidingWindowCmn",
     "SoudenMVDR",
+    "SpecAugment",
     "SpectralCentroid",
     "Spectrogram",
     "Speed",