torchaudio 2.0.2-cp38-cp38-manylinux1_x86_64.whl → 2.1.1-cp38-cp38-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchaudio might be problematic.
- torchaudio/__init__.py +22 -3
- torchaudio/_backend/__init__.py +55 -4
- torchaudio/_backend/backend.py +53 -0
- torchaudio/_backend/common.py +52 -0
- torchaudio/_backend/ffmpeg.py +373 -0
- torchaudio/_backend/soundfile.py +54 -0
- torchaudio/_backend/soundfile_backend.py +457 -0
- torchaudio/_backend/sox.py +91 -0
- torchaudio/_backend/utils.py +81 -323
- torchaudio/_extension/__init__.py +55 -36
- torchaudio/_extension/utils.py +109 -17
- torchaudio/_internal/__init__.py +4 -1
- torchaudio/_internal/module_utils.py +37 -6
- torchaudio/backend/__init__.py +7 -11
- torchaudio/backend/_no_backend.py +24 -0
- torchaudio/backend/_sox_io_backend.py +297 -0
- torchaudio/backend/common.py +12 -52
- torchaudio/backend/no_backend.py +11 -21
- torchaudio/backend/soundfile_backend.py +11 -448
- torchaudio/backend/sox_io_backend.py +11 -435
- torchaudio/backend/utils.py +9 -18
- torchaudio/datasets/__init__.py +2 -0
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/cmudict.py +61 -62
- torchaudio/datasets/dr_vctk.py +1 -1
- torchaudio/datasets/gtzan.py +1 -1
- torchaudio/datasets/librilight_limited.py +1 -1
- torchaudio/datasets/librispeech.py +1 -1
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +1 -1
- torchaudio/datasets/ljspeech.py +1 -1
- torchaudio/datasets/musdb_hq.py +1 -1
- torchaudio/datasets/quesst14.py +1 -1
- torchaudio/datasets/speechcommands.py +1 -1
- torchaudio/datasets/tedlium.py +1 -1
- torchaudio/datasets/vctk.py +1 -1
- torchaudio/datasets/voxceleb1.py +1 -1
- torchaudio/datasets/yesno.py +1 -1
- torchaudio/functional/__init__.py +6 -2
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +69 -92
- torchaudio/functional/functional.py +99 -148
- torchaudio/io/__init__.py +4 -1
- torchaudio/io/_effector.py +347 -0
- torchaudio/io/_stream_reader.py +158 -90
- torchaudio/io/_stream_writer.py +196 -10
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/_torchaudio_sox.so +0 -0
- torchaudio/lib/libctc_prefix_decoder.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/libtorchaudio_sox.so +0 -0
- torchaudio/lib/pybind11_prefixctc.so +0 -0
- torchaudio/models/__init__.py +14 -0
- torchaudio/models/decoder/__init__.py +22 -7
- torchaudio/models/decoder/_ctc_decoder.py +123 -69
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/rnnt_decoder.py +10 -14
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/wav2vec2/components.py +6 -10
- torchaudio/pipelines/__init__.py +9 -0
- torchaudio/pipelines/_squim_pipeline.py +176 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +198 -68
- torchaudio/pipelines/_wav2vec2/utils.py +120 -0
- torchaudio/sox_effects/sox_effects.py +7 -30
- torchaudio/transforms/__init__.py +2 -0
- torchaudio/transforms/_transforms.py +99 -54
- torchaudio/utils/download.py +2 -2
- torchaudio/utils/ffmpeg_utils.py +20 -15
- torchaudio/utils/sox_utils.py +8 -9
- torchaudio/version.py +2 -2
- torchaudio-2.1.1.dist-info/METADATA +113 -0
- torchaudio-2.1.1.dist-info/RECORD +119 -0
- torchaudio/io/_compat.py +0 -241
- torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
- torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
- torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
- torchaudio/lib/libflashlight-text.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
- torchaudio-2.0.2.dist-info/METADATA +0 -26
- torchaudio-2.0.2.dist-info/RECORD +0 -100
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
torchaudio/transforms/_transforms.py
CHANGED

@@ -36,7 +36,7 @@ class Spectrogram(torch.nn.Module):
         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
         power (float or None, optional): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc.
+            (must be > 0) e.g., 1 for magnitude, 2 for power, etc.
             If None, then the complex spectrum is returned instead. (Default: ``2``)
         normalized (bool or str, optional): Whether to normalize by magnitude after stft. If input is str, choices are
             ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to
@@ -227,7 +227,7 @@ class GriffinLim(torch.nn.Module):
         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
         power (float, optional): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
+            (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
         wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
         momentum (float, optional): The momentum parameter for fast Griffin-Lim.
             Setting this to 0 recovers the original Griffin-Lim method.
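For context on the `power` argument these docstring fixes describe: it is the exponent applied to the magnitude of each STFT frame, and `None` switches the transform to returning the raw complex spectrum. A minimal sketch (the random waveform is only a stand-in for real audio):

>>> import torch
>>> import torchaudio.transforms as T
>>> waveform = torch.randn(1, 16000)  # placeholder 1-second mono signal
>>> magnitude = T.Spectrogram(power=1)(waveform)   # |STFT|, i.e. magnitude
>>> power = T.Spectrogram(power=2)(waveform)       # |STFT|**2, i.e. power (the default)
>>> T.Spectrogram(power=None)(waveform).dtype      # raw complex spectrum
torch.complex64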
@@ -420,7 +420,7 @@ class InverseMelScale(torch.nn.Module):
     .. devices:: CPU CUDA

     It minimizes the euclidian norm between the input mel-spectrogram and the product between
-    the estimated spectrogram and the filter banks using SGD.
+    the estimated spectrogram and the filter banks using `torch.linalg.lstsq`.

     Args:
         n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
@@ -428,13 +428,13 @@ class InverseMelScale(torch.nn.Module):
         sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
         f_min (float, optional): Minimum frequency. (Default: ``0.``)
         f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
-        max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
-        tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
-        tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
-        sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
         norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band
             (area normalization). (Default: ``None``)
         mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+        driver (str, optional): Name of the LAPACK/MAGMA method to be used for `torch.lstsq`.
+            For CPU inputs the valid values are ``"gels"``, ``"gelsy"``, ``"gelsd"``, ``"gelss"``.
+            For CUDA input, the only valid driver is ``"gels"``, which assumes that A is full-rank.
+            (Default: ``"gels``)

     Example
         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
@@ -449,10 +449,6 @@ class InverseMelScale(torch.nn.Module):
         "sample_rate",
         "f_min",
         "f_max",
-        "max_iter",
-        "tolerance_loss",
-        "tolerance_change",
-        "sgdargs",
     ]

     def __init__(
@@ -462,26 +458,23 @@ class InverseMelScale(torch.nn.Module):
         sample_rate: int = 16000,
         f_min: float = 0.0,
         f_max: Optional[float] = None,
-        max_iter: int = 100000,
-        tolerance_loss: float = 1e-5,
-        tolerance_change: float = 1e-8,
-        sgdargs: Optional[dict] = None,
         norm: Optional[str] = None,
         mel_scale: str = "htk",
+        driver: str = "gels",
     ) -> None:
         super(InverseMelScale, self).__init__()
         self.n_mels = n_mels
         self.sample_rate = sample_rate
         self.f_max = f_max or float(sample_rate // 2)
         self.f_min = f_min
-        self.max_iter = max_iter
-        self.tolerance_loss = tolerance_loss
-        self.tolerance_change = tolerance_change
-        self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
+        self.driver = driver

         if f_min > self.f_max:
             raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))

+        if driver not in ["gels", "gelsy", "gelsd", "gelss"]:
+            raise ValueError(f'driver must be one of ["gels", "gelsy", "gelsd", "gelss"]. Found {driver}.')
+
         fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, norm, mel_scale)
         self.register_buffer("fb", fb)

@@ -499,34 +492,10 @@ class InverseMelScale(torch.nn.Module):

         n_mels, time = shape[-2], shape[-1]
         freq, _ = self.fb.size()  # (freq, n_mels)
-        melspec = melspec.transpose(-1, -2)
         if self.n_mels != n_mels:
             raise ValueError("Expected an input with {} mel bins. Found: {}".format(self.n_mels, n_mels))

-        specgram = torch.rand(
-            melspec.size()[0], time, freq, requires_grad=True, dtype=melspec.dtype, device=melspec.device
-        )
-
-        optim = torch.optim.SGD([specgram], **self.sgdargs)
-
-        loss = float("inf")
-        for _ in range(self.max_iter):
-            optim.zero_grad()
-            diff = melspec - specgram.matmul(self.fb)
-            new_loss = diff.pow(2).sum(axis=-1).mean()
-            # take sum over mel-frequency then average over other dimensions
-            # so that loss threshold is applied par unit timeframe
-            new_loss.backward()
-            optim.step()
-            specgram.data = specgram.data.clamp(min=0)
-
-            new_loss = new_loss.item()
-            if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
-                break
-            loss = new_loss
-
-        specgram.requires_grad_(False)
-        specgram = specgram.clamp(min=0).transpose(-1, -2)
+        specgram = torch.relu(torch.linalg.lstsq(self.fb.transpose(-1, -2)[None], melspec, driver=self.driver).solution)

         # unpack batch
         specgram = specgram.view(shape[:-2] + (freq, time))
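The rewritten `forward` replaces the old SGD loop with a single batched least-squares solve: `torch.linalg.lstsq` finds the spectrogram that best reproduces the mel-spectrogram under the filter bank `fb`, and `torch.relu` clamps negative solutions to zero. A usage sketch of the new signature (the shapes and the `driver` choice are illustrative; per the docstring, CUDA inputs only accept "gels"):

>>> import torch
>>> import torchaudio.transforms as T
>>> melspec = T.MelSpectrogram(sample_rate=16000, n_fft=400, n_mels=64)(torch.randn(1, 16000))
>>> inverse = T.InverseMelScale(n_stft=201, n_mels=64, driver="gelsd")  # n_stft = n_fft // 2 + 1
>>> specgram = inverse(melspec)  # (1, 201, time), non-negative, solved in one shot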
@@ -540,7 +509,7 @@ class MelSpectrogram(torch.nn.Module):

     .. properties:: Autograd TorchScript

-    This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
+    This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
     and :py:func:`torchaudio.transforms.MelScale`.

     Sources
@@ -560,7 +529,7 @@ class MelSpectrogram(torch.nn.Module):
         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
         power (float, optional): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
+            (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
         normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
         wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
         center (bool, optional): whether to pad :attr:`waveform` on both sides so
@@ -1196,15 +1165,16 @@ class _AxisMasking(torch.nn.Module):

     Args:
         mask_param (int): Maximum possible length of the mask.
-        axis (int): What dimension the mask is applied on.
+        axis (int): What dimension the mask is applied on (assuming the tensor is 3D).
+            For frequency masking, axis = 1.
+            For time masking, axis = 2.
         iid_masks (bool): Applies iid masks to each of the examples in the batch dimension.
-            This option is applicable only when the input tensor is 4D.
+            This option is applicable only when the dimension of the input tensor is >= 3.
         p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0)
     """
     __constants__ = ["mask_param", "axis", "iid_masks", "p"]

     def __init__(self, mask_param: int, axis: int, iid_masks: bool, p: float = 1.0) -> None:
-
         super(_AxisMasking, self).__init__()
         self.mask_param = mask_param
         self.axis = axis
@@ -1221,10 +1191,14 @@ class _AxisMasking(torch.nn.Module):
             Tensor: Masked spectrogram of dimensions `(..., freq, time)`.
         """
         # if iid_masks flag marked and specgram has a batch dimension
-        if self.iid_masks and specgram.dim() == 4:
-            return F.mask_along_axis_iid(specgram, self.mask_param, mask_value, self.axis + 1, p=self.p)
+        # self.axis + specgram.dim() - 3 gives the time/frequency dimension (last two dimensions)
+        # for input tensor for which the dimension is not 3.
+        if self.iid_masks:
+            return F.mask_along_axis_iid(
+                specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p
+            )
         else:
-            return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis, p=self.p)
+            return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p)


 class FrequencyMasking(_AxisMasking):
@@ -1241,7 +1215,7 @@ class FrequencyMasking(_AxisMasking):
             Indices uniformly sampled from [0, freq_mask_param).
         iid_masks (bool, optional): whether to apply different masks to each
             example/channel in the batch. (Default: ``False``)
-            This option is applicable only when the input tensor is 4D.
+            This option is applicable only when the input tensor >= 3D.

     Example
         >>> spectrogram = torchaudio.transforms.Spectrogram()
@@ -1275,7 +1249,7 @@ class TimeMasking(_AxisMasking):
             Indices uniformly sampled from [0, time_mask_param).
         iid_masks (bool, optional): whether to apply different masks to each
             example/channel in the batch. (Default: ``False``)
-            This option is applicable only when the input tensor is 4D.
+            This option is applicable only when the input tensor >= 3D.
         p (float, optional): maximum proportion of time steps that can be masked.
             Must be within range [0.0, 1.0]. (Default: 1.0)
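The `self.axis + specgram.dim() - 3` offset is what lets the masking transforms accept inputs with extra leading dimensions: `axis` is still specified as if the tensor were 3D `(batch, freq, time)`, and the offset maps it onto the actual last two dimensions. A sketch with a 4D batch (shapes are illustrative):

>>> import torch
>>> import torchaudio.transforms as T
>>> specgram = torch.randn(8, 2, 201, 400)  # (batch, channel, freq, time)
>>> masking = T.TimeMasking(time_mask_param=80, iid_masks=True)
>>> masked = masking(specgram)  # an independent time mask per example/channel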
@@ -1299,6 +1273,77 @@ class TimeMasking(_AxisMasking):
         super(TimeMasking, self).__init__(time_mask_param, 2, iid_masks, p=p)


+class SpecAugment(torch.nn.Module):
+    r"""Apply time and frequency masking to a spectrogram.
+    Args:
+        n_time_masks (int): Number of time masks. If its value is zero, no time masking will be applied.
+        time_mask_param (int): Maximum possible length of the time mask.
+        n_freq_masks (int): Number of frequency masks. If its value is zero, no frequency masking will be applied.
+        freq_mask_param (int): Maximum possible length of the frequency mask.
+        iid_masks (bool, optional): Applies iid masks to each of the examples in the batch dimension.
+            This option is applicable only when the input tensor is 4D. (Default: ``True``)
+        p (float, optional): maximum proportion of time steps that can be masked.
+            Must be within range [0.0, 1.0]. (Default: 1.0)
+        zero_masking (bool, optional): If ``True``, use 0 as the mask value,
+            else use mean of the input tensor. (Default: ``False``)
+    """
+    __constants__ = [
+        "n_time_masks",
+        "time_mask_param",
+        "n_freq_masks",
+        "freq_mask_param",
+        "iid_masks",
+        "p",
+        "zero_masking",
+    ]
+
+    def __init__(
+        self,
+        n_time_masks: int,
+        time_mask_param: int,
+        n_freq_masks: int,
+        freq_mask_param: int,
+        iid_masks: bool = True,
+        p: float = 1.0,
+        zero_masking: bool = False,
+    ) -> None:
+        super(SpecAugment, self).__init__()
+        self.n_time_masks = n_time_masks
+        self.time_mask_param = time_mask_param
+        self.n_freq_masks = n_freq_masks
+        self.freq_mask_param = freq_mask_param
+        self.iid_masks = iid_masks
+        self.p = p
+        self.zero_masking = zero_masking
+
+    def forward(self, specgram: Tensor) -> Tensor:
+        r"""
+        Args:
+            specgram (Tensor): Tensor of shape `(..., freq, time)`.
+        Returns:
+            Tensor: Masked spectrogram of shape `(..., freq, time)`.
+        """
+        if self.zero_masking:
+            mask_value = 0.0
+        else:
+            mask_value = specgram.mean()
+        time_dim = specgram.dim() - 1
+        freq_dim = time_dim - 1
+
+        if specgram.dim() > 2 and self.iid_masks is True:
+            for _ in range(self.n_time_masks):
+                specgram = F.mask_along_axis_iid(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
+            for _ in range(self.n_freq_masks):
+                specgram = F.mask_along_axis_iid(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
+        else:
+            for _ in range(self.n_time_masks):
+                specgram = F.mask_along_axis(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
+            for _ in range(self.n_freq_masks):
+                specgram = F.mask_along_axis(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
+
+        return specgram
+
+
 class Loudness(torch.nn.Module):
     r"""Measure audio loudness according to the ITU-R BS.1770-4 recommendation.

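The new `SpecAugment` module chains the masking primitives above: each forward pass applies `n_time_masks` time masks and then `n_freq_masks` frequency masks, filling with zeros or the input mean. A usage sketch (mask counts and widths are arbitrary):

>>> import torch
>>> import torchaudio.transforms as T
>>> specgram = torch.randn(4, 201, 400)  # (batch, freq, time)
>>> augment = T.SpecAugment(
...     n_time_masks=2,
...     time_mask_param=40,
...     n_freq_masks=2,
...     freq_mask_param=30,
...     zero_masking=True,
... )
>>> augmented = augment(specgram)  # same shape, with masked rectangles zeroed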
torchaudio/utils/download.py
CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import Union

 import torch
-
+from torchaudio._internal import download_url_to_file

 _LG = logging.getLogger(__name__)

@@ -18,7 +18,7 @@ def _get_local_path(key):

 def _download(key, path, progress):
     url = f"https://download.pytorch.org/torchaudio/{key}"
-
+    download_url_to_file(url, path, progress=progress)


 def _get_hash(path, hash, chunk_size=1028):
torchaudio/utils/ffmpeg_utils.py
CHANGED
@@ -4,7 +4,6 @@ It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`
 """
 from typing import Dict, List, Tuple

-import torch
 import torchaudio


@@ -16,7 +15,7 @@ def get_versions() -> Dict[str, Tuple[int]]:
         dict: mapping from library names to version string,
             i.e. `"libavutil": (56, 22, 100)`.
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_versions()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -25,7 +24,7 @@ def get_log_level() -> int:

     See :py:func:`set_log_level` for the detail.
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_log_level()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -62,7 +61,7 @@ def set_log_level(level: int):
             Extremely verbose debugging, useful for libav* development.

     """
-
+    torchaudio._extension._FFMPEG_EXT.set_log_level(level)


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -80,7 +79,7 @@ def get_demuxers() -> Dict[str, str]:
         ... aax: CRI AAX
         ... ac3: raw AC-3
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_demuxers()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -99,7 +98,7 @@ def get_muxers() -> Dict[str, str]:
         ... adx: CRI ADX
         ... aiff: Audio IFF
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_muxers()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -118,7 +117,7 @@ def get_audio_decoders() -> Dict[str, str]:
         ... adx: CRI ADX
         ... aiff: Audio IFF
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_audio_decoders()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -138,7 +137,7 @@ def get_audio_encoders() -> Dict[str, str]:
         ... ac3_fixed: ATSC A/52A (AC-3)
         ... alac: ALAC (Apple Lossless Audio Codec)
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_audio_encoders()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -158,7 +157,7 @@ def get_video_decoders() -> Dict[str, str]:
         ... amv: AMV Video
         ... anm: Deluxe Paint Animation
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_video_decoders()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -179,7 +178,7 @@ def get_video_encoders() -> Dict[str, str]:
         ... asv1: ASUS V1
         ... asv2: ASUS V2
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_video_encoders()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -195,7 +194,7 @@ def get_input_devices() -> Dict[str, str]:
         ... avfoundation: AVFoundation input device
         ... lavfi: Libavfilter virtual input device
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_input_devices()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -210,7 +209,7 @@ def get_output_devices() -> Dict[str, str]:
         >>> print(f"{k}: {v}")
         ... audiotoolbox: AudioToolbox output device
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_output_devices()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -224,7 +223,7 @@ def get_input_protocols() -> List[str]:
         >>> print(get_input_protocols())
         ... ['file', 'ftp', 'hls', 'http', 'https', 'pipe', 'rtmp', 'tcp', 'tls', 'udp', 'unix']
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_input_protocols()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -238,7 +237,7 @@ def get_output_protocols() -> List[str]:
         >>> print(get_output_protocols())
         ... ['file', 'ftp', 'http', 'https', 'md5', 'pipe', 'prompeg', 'rtmp', 'tee', 'tcp', 'tls', 'udp', 'unix']
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_output_protocols()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -252,4 +251,10 @@ def get_build_config() -> str:
         >>> print(get_build_config())
         --prefix=/Users/runner/miniforge3 --cc=arm64-apple-darwin20.0.0-clang --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-neon --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-libvpx --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/pkg-config --enable-cross-compile --arch=arm64 --target-os=darwin --cross-prefix=arm64-apple-darwin20.0.0- --host-cc=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/x86_64-apple-darwin13.4.0-clang  # noqa
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_build_config()
+
+
+@torchaudio._extension.fail_if_no_ffmpeg
+def clear_cuda_context_cache():
+    """Clear the CUDA context used by CUDA Hardware accelerated video decoding"""
+    torchaudio._extension._FFMPEG_EXT.clear_cuda_context_cache()
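All of these stubs now return the result of the corresponding call on `torchaudio._extension._FFMPEG_EXT`, the lazily loaded binding that resolves to the matching `_torchaudio_ffmpeg4/5/6` library shipped in the wheel. A sketch of the query API, assuming an FFmpeg-enabled build (the version tuple is the docstring's example, not a guaranteed value):

>>> from torchaudio.utils import ffmpeg_utils
>>> ffmpeg_utils.get_versions()["libavutil"]  # e.g. (56, 22, 100)
>>> "mp3" in ffmpeg_utils.get_audio_decoders()
True
>>> ffmpeg_utils.set_log_level(8)  # lower values silence more of the libav* logging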
torchaudio/utils/sox_utils.py
CHANGED
@@ -4,7 +4,6 @@

 from typing import Dict, List

-import torch
 import torchaudio


@@ -18,7 +17,7 @@ def set_seed(seed: int):

     See Also:
         http://sox.sourceforge.net/sox.html
     """
-
+    torchaudio.lib._torchaudio_sox.set_seed(seed)


 @torchaudio._extension.fail_if_no_sox
@@ -36,7 +35,7 @@ def set_verbosity(verbosity: int):

     See Also:
         http://sox.sourceforge.net/sox.html
     """
-
+    torchaudio.lib._torchaudio_sox.set_verbosity(verbosity)


 @torchaudio._extension.fail_if_no_sox
@@ -49,7 +48,7 @@ def set_buffer_size(buffer_size: int):

     See Also:
         http://sox.sourceforge.net/sox.html
     """
-
+    torchaudio.lib._torchaudio_sox.set_buffer_size(buffer_size)


 @torchaudio._extension.fail_if_no_sox
@@ -63,7 +62,7 @@ def set_use_threads(use_threads: bool):

     See Also:
         http://sox.sourceforge.net/sox.html
     """
-
+    torchaudio.lib._torchaudio_sox.set_use_threads(use_threads)


 @torchaudio._extension.fail_if_no_sox
@@ -73,7 +72,7 @@ def list_effects() -> Dict[str, str]:

     Returns:
         Dict[str, str]: Mapping from ``effect name`` to ``usage``
     """
-    return dict(
+    return dict(torchaudio.lib._torchaudio_sox.list_effects())


 @torchaudio._extension.fail_if_no_sox
@@ -83,7 +82,7 @@ def list_read_formats() -> List[str]:

     Returns:
         List[str]: List of supported audio formats
     """
-    return
+    return torchaudio.lib._torchaudio_sox.list_read_formats()


 @torchaudio._extension.fail_if_no_sox
@@ -93,7 +92,7 @@ def list_write_formats() -> List[str]:

     Returns:
         List[str]: List of supported audio formats
     """
-    return
+    return torchaudio.lib._torchaudio_sox.list_write_formats()


 @torchaudio._extension.fail_if_no_sox
@@ -103,4 +102,4 @@ def get_buffer_size() -> int:

     Returns:
         int: size in bytes of buffers used for processing audio.
     """
-    return
+    return torchaudio.lib._torchaudio_sox.get_buffer_size()
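The SoX helpers follow the same pattern, calling straight into the bundled `_torchaudio_sox` extension. A sketch, assuming a build with SoX support (the sox backend is not available on Windows):

>>> from torchaudio.utils import sox_utils
>>> "flac" in sox_utils.list_read_formats()
True
>>> effects = sox_utils.list_effects()  # dict: effect name -> usage string
>>> sox_utils.set_verbosity(1)          # 1 = show failure messages only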
torchaudio/version.py
CHANGED
@@ -1,2 +1,2 @@
-__version__ = '2.
-git_version = '
+__version__ = '2.1.1+cu121'
+git_version = '5784206b90d738de888dce4c99b8b46be213f019'
torchaudio-2.1.1.dist-info/METADATA
ADDED

@@ -0,0 +1,113 @@
+Metadata-Version: 2.1
+Name: torchaudio
+Version: 2.1.1
+Summary: An audio package for PyTorch
+Home-page: https://github.com/pytorch/audio
+Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
+Author-email: soumith@pytorch.org
+Maintainer: Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
+Maintainer-email: moto@meta.com
+Classifier: Environment :: Plugins
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Operating System :: POSIX
+Classifier: Programming Language :: C++
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch (==2.1.1)
+
+torchaudio: an audio library for PyTorch
+========================================
+
+[](https://pytorch.org/audio/main/)
+[](https://anaconda.org/pytorch/torchaudio)
+[](https://anaconda.org/pytorch/torchaudio)
+
+
+
+The aim of torchaudio is to apply [PyTorch](https://github.com/pytorch/pytorch) to
+the audio domain. By supporting PyTorch, torchaudio follows the same philosophy
+of providing strong GPU acceleration, having a focus on trainable features through
+the autograd system, and having consistent style (tensor names and dimension names).
+Therefore, it is primarily a machine learning library and not a general signal
+processing library. The benefits of PyTorch can be seen in torchaudio through
+having all the computations be through PyTorch operations which makes it easy
+to use and feel like a natural extension.
+
+- [Support audio I/O (Load files, Save files)](http://pytorch.org/audio/main/)
+- Load a variety of audio formats, such as `wav`, `mp3`, `ogg`, `flac`, `opus`, `sphere`, into a torch Tensor using SoX
+- [Kaldi (ark/scp)](http://pytorch.org/audio/main/kaldi_io.html)
+- [Dataloaders for common audio datasets](http://pytorch.org/audio/main/datasets.html)
+- Audio and speech processing functions
+- [forced_align](https://pytorch.org/audio/main/generated/torchaudio.functional.forced_align.html)
+- Common audio transforms
+- [Spectrogram, AmplitudeToDB, MelScale, MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding, Resample](http://pytorch.org/audio/main/transforms.html)
+- Compliance interfaces: Run code using PyTorch that align with other libraries
+- [Kaldi: spectrogram, fbank, mfcc](https://pytorch.org/audio/main/compliance.kaldi.html)
+
+Installation
+------------
+
+Please refer to https://pytorch.org/audio/main/installation.html for installation and build process of TorchAudio.
+
+
+API Reference
+-------------
+
+API Reference is located here: http://pytorch.org/audio/main/
+
+Contributing Guidelines
+-----------------------
+
+Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md)
+
+Citation
+--------
+
+If you find this package useful, please cite as:
+
+```bibtex
+@article{yang2021torchaudio,
+  title={TorchAudio: Building Blocks for Audio and Speech Processing},
+  author={Yao-Yuan Yang and Moto Hira and Zhaoheng Ni and Anjali Chourdia and Artyom Astafurov and Caroline Chen and Ching-Feng Yeh and Christian Puhrsch and David Pollack and Dmitriy Genzel and Donny Greenberg and Edward Z. Yang and Jason Lian and Jay Mahadeokar and Jeff Hwang and Ji Chen and Peter Goldsborough and Prabhat Roy and Sean Narenthiran and Shinji Watanabe and Soumith Chintala and Vincent Quenneville-Bélair and Yangyang Shi},
+  journal={arXiv preprint arXiv:2110.15018},
+  year={2021}
+}
+```
+
+```bibtex
+@misc{hwang2023torchaudio,
+  title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
+  author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
+  year={2023},
+  eprint={2310.17864},
+  archivePrefix={arXiv},
+  primaryClass={eess.AS}
+}
+```
+
+Disclaimer on Datasets
+----------------------
+
+This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license.
+
+If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community!
+
+Pre-trained Model License
+-------------------------
+
+The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case.
+
+For instance, SquimSubjective model is released under the Creative Commons Attribution Non Commercial 4.0 International (CC-BY-NC 4.0) license. See [the link](https://zenodo.org/record/4660670#.ZBtWPOxuerN) for additional details.
+
+Other pre-trained models that have different license are noted in documentation. Please checkout the [documentation page](https://pytorch.org/audio/main/).