PyPI - torchaudio - Versions diffs - 2.0.2__cp38-cp38-manylinux2014_aarch64.whl → 2.1.1__cp38-cp38-manylinux2014_aarch64.whl - Mend

torchaudio 2.0.2__cp38-cp38-manylinux2014_aarch64.whl → 2.1.1__cp38-cp38-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchaudio might be problematic. Click here for more details.

Files changed (90)

torchaudio/__init__.py +22 -3
torchaudio/_backend/__init__.py +55 -4
torchaudio/_backend/backend.py +53 -0
torchaudio/_backend/common.py +52 -0
torchaudio/_backend/ffmpeg.py +373 -0
torchaudio/_backend/soundfile.py +54 -0
torchaudio/_backend/soundfile_backend.py +457 -0
torchaudio/_backend/sox.py +91 -0
torchaudio/_backend/utils.py +81 -323
torchaudio/_extension/__init__.py +55 -36
torchaudio/_extension/utils.py +109 -17
torchaudio/_internal/__init__.py +4 -1
torchaudio/_internal/module_utils.py +37 -6
torchaudio/backend/__init__.py +7 -11
torchaudio/backend/_no_backend.py +24 -0
torchaudio/backend/_sox_io_backend.py +297 -0
torchaudio/backend/common.py +12 -52
torchaudio/backend/no_backend.py +11 -21
torchaudio/backend/soundfile_backend.py +11 -448
torchaudio/backend/sox_io_backend.py +11 -435
torchaudio/backend/utils.py +9 -18
torchaudio/datasets/__init__.py +2 -0
torchaudio/datasets/cmuarctic.py +1 -1
torchaudio/datasets/cmudict.py +61 -62
torchaudio/datasets/dr_vctk.py +1 -1
torchaudio/datasets/gtzan.py +1 -1
torchaudio/datasets/librilight_limited.py +1 -1
torchaudio/datasets/librispeech.py +1 -1
torchaudio/datasets/librispeech_biasing.py +189 -0
torchaudio/datasets/libritts.py +1 -1
torchaudio/datasets/ljspeech.py +1 -1
torchaudio/datasets/musdb_hq.py +1 -1
torchaudio/datasets/quesst14.py +1 -1
torchaudio/datasets/speechcommands.py +1 -1
torchaudio/datasets/tedlium.py +1 -1
torchaudio/datasets/vctk.py +1 -1
torchaudio/datasets/voxceleb1.py +1 -1
torchaudio/datasets/yesno.py +1 -1
torchaudio/functional/__init__.py +6 -2
torchaudio/functional/_alignment.py +128 -0
torchaudio/functional/filtering.py +69 -92
torchaudio/functional/functional.py +99 -148
torchaudio/io/__init__.py +4 -1
torchaudio/io/_effector.py +347 -0
torchaudio/io/_stream_reader.py +158 -90
torchaudio/io/_stream_writer.py +196 -10
torchaudio/lib/_torchaudio.so +0 -0
torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
torchaudio/lib/_torchaudio_sox.so +0 -0
torchaudio/lib/libtorchaudio.so +0 -0
torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
torchaudio/lib/libtorchaudio_sox.so +0 -0
torchaudio/models/__init__.py +14 -0
torchaudio/models/decoder/__init__.py +22 -7
torchaudio/models/decoder/_ctc_decoder.py +123 -69
torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
torchaudio/models/rnnt_decoder.py +10 -14
torchaudio/models/squim/__init__.py +11 -0
torchaudio/models/squim/objective.py +326 -0
torchaudio/models/squim/subjective.py +150 -0
torchaudio/models/wav2vec2/components.py +6 -10
torchaudio/pipelines/__init__.py +9 -0
torchaudio/pipelines/_squim_pipeline.py +176 -0
torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
torchaudio/pipelines/_wav2vec2/impl.py +198 -68
torchaudio/pipelines/_wav2vec2/utils.py +120 -0
torchaudio/sox_effects/sox_effects.py +7 -30
torchaudio/transforms/__init__.py +2 -0
torchaudio/transforms/_transforms.py +99 -54
torchaudio/utils/download.py +2 -2
torchaudio/utils/ffmpeg_utils.py +20 -15
torchaudio/utils/sox_utils.py +8 -9
torchaudio/version.py +2 -2
torchaudio-2.1.1.dist-info/METADATA +113 -0
torchaudio-2.1.1.dist-info/RECORD +117 -0
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +1 -1
torchaudio/io/_compat.py +0 -241
torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
torchaudio/lib/libflashlight-text.so +0 -0
torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
torchaudio-2.0.2.dist-info/METADATA +0 -26
torchaudio-2.0.2.dist-info/RECORD +0 -100
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
{torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0

torchaudio/functional/filtering.py CHANGED Viewed

@@ -1390,11 +1390,11 @@ def _measure(
     cepstrum_end: int,
     noise_reduction_amount: float,
     measure_smooth_time_mult: float,
-    noise_up_time_mult: float,
-    noise_down_time_mult: float,
-    index_ns: int,
+    noise_up_time_mult: Tensor,
+    noise_down_time_mult: Tensor,
     boot_count: int,
 ) -> float:
+    device = samples.device
     if spectrum.size(-1) != noise_spectrum.size(-1):
         raise ValueError(
@@ -1402,37 +1402,29 @@ def _measure(
             f"Found: spectrum size: {spectrum.size()}, noise_spectrum size: {noise_spectrum.size()}"
         )
-    samplesLen_ns = samples.size()[-1]
     dft_len_ws = spectrum.size()[-1]
-    dftBuf = torch.zeros(dft_len_ws)
+    dftBuf = torch.zeros(dft_len_ws, device=device)
-    _index_ns = torch.tensor([index_ns] + [(index_ns + i) % samplesLen_ns for i in range(1, measure_len_ws)])
-    dftBuf[:measure_len_ws] = samples[_index_ns] * spectrum_window[:measure_len_ws]
-    # memset(c->dftBuf + i, 0, (p->dft_len_ws - i) * sizeof(*c->dftBuf));
-    dftBuf[measure_len_ws:dft_len_ws].zero_()
+    dftBuf[:measure_len_ws] = samples * spectrum_window[:measure_len_ws]
     # lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf);
     _dftBuf = torch.fft.rfft(dftBuf)
-    # memset(c->dftBuf, 0, p->spectrum_start * sizeof(*c->dftBuf));
-    _dftBuf[:spectrum_start].zero_()
     mult: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult
     _d = _dftBuf[spectrum_start:spectrum_end].abs()
     spectrum[spectrum_start:spectrum_end].mul_(mult).add_(_d * (1 - mult))
     _d = spectrum[spectrum_start:spectrum_end] ** 2
-    _zeros = torch.zeros(spectrum_end - spectrum_start)
+    _zeros = torch.zeros(spectrum_end - spectrum_start, device=device)
     _mult = (
         _zeros
         if boot_count >= 0
         else torch.where(
             _d > noise_spectrum[spectrum_start:spectrum_end],
-            torch.tensor(noise_up_time_mult),  # if
-            torch.tensor(noise_down_time_mult),  # else
+            noise_up_time_mult,  # if
+            noise_down_time_mult,  # else,
         )
     )
@@ -1441,10 +1433,10 @@ def _measure(
         torch.max(
             _zeros,
             _d - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end],
-        )
+        ),
     )
-    _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1)
+    _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1, device=device)
     _cepstrum_Buf[spectrum_start:spectrum_end] = _d * cepstrum_window
     _cepstrum_Buf[spectrum_end : dft_len_ws >> 1].zero_()
@@ -1539,6 +1531,7 @@ def vad(
     Reference:
         - http://sox.sourceforge.net/sox.html
     """
+    device = waveform.device
     if waveform.ndim > 2:
         warnings.warn(
@@ -1566,23 +1559,23 @@ def vad(
     fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5)
     samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns
-    spectrum_window = torch.zeros(measure_len_ws)
+    spectrum_window = torch.zeros(measure_len_ws, device=device)
     for i in range(measure_len_ws):
         # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32)
         spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws))
     # lsx_apply_hann(spectrum_window, (int)measure_len_ws);
-    spectrum_window *= torch.hann_window(measure_len_ws, dtype=torch.float)
+    spectrum_window *= torch.hann_window(measure_len_ws, device=device, dtype=torch.float)
     spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5)
     spectrum_start: int = max(spectrum_start, 1)
     spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5)
     spectrum_end: int = min(spectrum_end, dft_len_ws // 2)
-    cepstrum_window = torch.zeros(spectrum_end - spectrum_start)
+    cepstrum_window = torch.zeros(spectrum_end - spectrum_start, device=device)
     for i in range(spectrum_end - spectrum_start):
         cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start)
     # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start));
-    cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, dtype=torch.float)
+    cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, device=device, dtype=torch.float)
     cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq)
     cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq)
@@ -1594,14 +1587,13 @@ def vad(
             f"Found: cepstrum_start: {cepstrum_start}, cepstrum_end: {cepstrum_end}."
         )
-    noise_up_time_mult = math.exp(-1.0 / (noise_up_time * measure_freq))
-    noise_down_time_mult = math.exp(-1.0 / (noise_down_time * measure_freq))
+    noise_up_time_mult = torch.tensor(math.exp(-1.0 / (noise_up_time * measure_freq)), device=device)
+    noise_down_time_mult = torch.tensor(math.exp(-1.0 / (noise_down_time * measure_freq)), device=device)
     measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq))
     trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq))
     boot_count_max = int(boot_time * measure_freq - 0.5)
-    measure_timer_ns = measure_len_ns
-    boot_count = measures_index = flushedLen_ns = samplesIndex_ns = 0
+    boot_count = measures_index = flushedLen_ns = 0
     # pack batch
     shape = waveform.size()
@@ -1609,80 +1601,65 @@ def vad(
     n_channels, ilen = waveform.size()
-    mean_meas = torch.zeros(n_channels)
-    samples = torch.zeros(n_channels, samplesLen_ns)
-    spectrum = torch.zeros(n_channels, dft_len_ws)
-    noise_spectrum = torch.zeros(n_channels, dft_len_ws)
-    measures = torch.zeros(n_channels, measures_len)
+    mean_meas = torch.zeros(n_channels, device=device)
+    spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
+    noise_spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
+    measures = torch.zeros(n_channels, measures_len, device=device)
     has_triggered: bool = False
     num_measures_to_flush: int = 0
-    pos: int = 0
-    while pos < ilen and not has_triggered:
-        measure_timer_ns -= 1
+    pos = 0
+    for pos in range(measure_len_ns, ilen, measure_period_ns):
         for i in range(n_channels):
-            samples[i, samplesIndex_ns] = waveform[i, pos]
-            # if (!p->measure_timer_ns) {
-            if measure_timer_ns == 0:
-                index_ns: int = (samplesIndex_ns + samplesLen_ns - measure_len_ns) % samplesLen_ns
-                meas: float = _measure(
-                    measure_len_ws=measure_len_ws,
-                    samples=samples[i],
-                    spectrum=spectrum[i],
-                    noise_spectrum=noise_spectrum[i],
-                    spectrum_window=spectrum_window,
-                    spectrum_start=spectrum_start,
-                    spectrum_end=spectrum_end,
-                    cepstrum_window=cepstrum_window,
-                    cepstrum_start=cepstrum_start,
-                    cepstrum_end=cepstrum_end,
-                    noise_reduction_amount=noise_reduction_amount,
-                    measure_smooth_time_mult=measure_smooth_time_mult,
-                    noise_up_time_mult=noise_up_time_mult,
-                    noise_down_time_mult=noise_down_time_mult,
-                    index_ns=index_ns,
-                    boot_count=boot_count,
-                )
-                measures[i, measures_index] = meas
-                mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult)
-                has_triggered = has_triggered or (mean_meas[i] >= trigger_level)
-                if has_triggered:
-                    n: int = measures_len
-                    k: int = measures_index
-                    jTrigger: int = n
-                    jZero: int = n
-                    j: int = 0
-                    for j in range(n):
-                        if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len):
-                            jZero = jTrigger = j
-                        elif (measures[i, k] == 0) and (jTrigger >= jZero):
-                            jZero = j
-                        k = (k + n - 1) % n
-                    j = min(j, jZero)
-                    # num_measures_to_flush = range_limit(j, num_measures_to_flush, n);
-                    num_measures_to_flush = min(max(num_measures_to_flush, j), n)
-                # end if has_triggered
-            # end if (measure_timer_ns == 0):
-        # end for
-        samplesIndex_ns += 1
-        pos += 1
-        # end while
-        if samplesIndex_ns == samplesLen_ns:
-            samplesIndex_ns = 0
-        if measure_timer_ns == 0:
-            measure_timer_ns = measure_period_ns
-            measures_index += 1
-            measures_index = measures_index % measures_len
-            if boot_count >= 0:
-                boot_count = -1 if boot_count == boot_count_max else boot_count + 1
+            meas: float = _measure(
+                measure_len_ws=measure_len_ws,
+                samples=waveform[i, pos - measure_len_ws : pos],
+                spectrum=spectrum[i],
+                noise_spectrum=noise_spectrum[i],
+                spectrum_window=spectrum_window,
+                spectrum_start=spectrum_start,
+                spectrum_end=spectrum_end,
+                cepstrum_window=cepstrum_window,
+                cepstrum_start=cepstrum_start,
+                cepstrum_end=cepstrum_end,
+                noise_reduction_amount=noise_reduction_amount,
+                measure_smooth_time_mult=measure_smooth_time_mult,
+                noise_up_time_mult=noise_up_time_mult,
+                noise_down_time_mult=noise_down_time_mult,
+                boot_count=boot_count,
+            )
+            measures[i, measures_index] = meas
+            mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult)
+            has_triggered = has_triggered or (mean_meas[i] >= trigger_level)
+            if has_triggered:
+                n: int = measures_len
+                k: int = measures_index
+                jTrigger: int = n
+                jZero: int = n
+                j: int = 0
+                for j in range(n):
+                    if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len):
+                        jZero = jTrigger = j
+                    elif (measures[i, k] == 0) and (jTrigger >= jZero):
+                        jZero = j
+                    k = (k + n - 1) % n
+                j = min(j, jZero)
+                # num_measures_to_flush = range_limit(j, num_measures_to_flush, n);
+                num_measures_to_flush = min(max(num_measures_to_flush, j), n)
+            # end if has_triggered
+        # end for channel
+        measures_index += 1
+        measures_index = measures_index % measures_len
+        if boot_count >= 0:
+            boot_count = -1 if boot_count == boot_count_max else boot_count + 1
         if has_triggered:
             flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns
-            samplesIndex_ns = (samplesIndex_ns + flushedLen_ns) % samplesLen_ns
+            break
+    # end for window
     res = waveform[:, pos - samplesLen_ns + flushedLen_ns :]
     # unpack batch
     return res.view(shape[:-1] + res.shape[-1:])

torchaudio/functional/functional.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
-import io
 import math
+import tempfile
 import warnings
 from collections.abc import Sequence
 from typing import List, Optional, Tuple, Union
@@ -9,6 +9,7 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torchaudio
 from torch import Tensor
+from torchaudio._internal.module_utils import deprecated
 from .filtering import highpass_biquad, treble_biquad
@@ -19,7 +20,6 @@ __all__ = [
     "amplitude_to_DB",
     "DB_to_amplitude",
     "compute_deltas",
-    "compute_kaldi_pitch",
     "melscale_fbanks",
     "linear_fbanks",
     "create_dct",
@@ -83,7 +83,7 @@ def spectrogram(
         hop_length (int): Length of hop between STFT windows
         win_length (int): Window size
         power (float or None): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc.
+            (must be > 0) e.g., 1 for magnitude, 2 for power, etc.
             If None, then the complex spectrum is returned instead.
         normalized (bool or str): Whether to normalize by magnitude after stft. If input is str, choices are
             ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to
@@ -286,7 +286,7 @@ def griffinlim(
             Default: ``win_length // 2``)
         win_length (int): Window size. (Default: ``n_fft``)
         power (float): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc.
+            (must be > 0) e.g., 1 for magnitude, 2 for power, etc.
         n_iter (int): Number of iteration for phase recovery process.
         momentum (float): The momentum parameter for fast Griffin-Lim.
             Setting this to 0 recovers the original Griffin-Lim method.
@@ -370,9 +370,17 @@ def amplitude_to_DB(
     Args:
-        x (Tensor): Input spectrogram(s) before being converted to decibel scale. Input should take
-          the form `(..., freq, time)`. Batched inputs should include a channel dimension and
-          have the form `(batch, channel, freq, time)`.
+        x (Tensor): Input spectrogram(s) before being converted to decibel scale.
+            The expected shapes are ``(freq, time)``, ``(channel, freq, time)`` or
+            ``(..., batch, channel, freq, time)``.
+            .. note::
+               When ``top_db`` is specified, cut-off values are computed for each audio
+               in the batch. Therefore if the input shape is 4D (or larger), different
+               cut-off values are used for audio data in the batch.
+               If the input shape is 2D or 3D, a single cutoff value is used.
         multiplier (float): Use 10. for power and 20. for amplitude
         amin (float): Number to clamp ``x``
         db_multiplier (float): Log10(max(reference value and amin))
@@ -547,7 +555,7 @@ def melscale_fbanks(
         meaning number of frequencies to highlight/apply to x the number of filterbanks.
         Each column is a filterbank so that assuming there is a matrix A of
         size (..., ``n_freqs``), the applied result would be
-        ``A * melscale_fbanks(A.size(-1), ...)``.
+        ``A @ melscale_fbanks(A.size(-1), ...)``.
     """
@@ -825,18 +833,25 @@ def mask_along_axis_iid(
     ``max_v = min(mask_param, floor(specgrams.size(axis) * p))`` otherwise.
     Args:
-        specgrams (Tensor): Real spectrograms `(batch, channel, freq, time)`
+        specgrams (Tensor): Real spectrograms `(..., freq, time)`, with at least 3 dimensions.
         mask_param (int): Number of columns to be masked will be uniformly sampled from [0, mask_param]
         mask_value (float): Value to assign to the masked columns
-        axis (int): Axis to apply masking on (2 -> frequency, 3 -> time)
+        axis (int): Axis to apply masking on, which should be the one of the last two dimensions.
         p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0)
     Returns:
-        Tensor: Masked spectrograms of dimensions `(batch, channel, freq, time)`
+        Tensor: Masked spectrograms with the same dimensions as input specgrams Tensor`
     """
-    if axis not in [2, 3]:
-        raise ValueError("Only Frequency and Time masking are supported")
+    dim = specgrams.dim()
+    if dim < 3:
+        raise ValueError(f"Spectrogram must have at least three dimensions ({dim} given).")
+    if axis not in [dim - 2, dim - 1]:
+        raise ValueError(
+            f"Only Frequency and Time masking are supported (axis {dim-2} and axis {dim-1} supported; {axis} given)."
+        )
     if not 0.0 <= p <= 1.0:
         raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).")
@@ -848,8 +863,8 @@ def mask_along_axis_iid(
     device = specgrams.device
     dtype = specgrams.dtype
-    value = torch.rand(specgrams.shape[:2], device=device, dtype=dtype) * mask_param
-    min_value = torch.rand(specgrams.shape[:2], device=device, dtype=dtype) * (specgrams.size(axis) - value)
+    value = torch.rand(specgrams.shape[: (dim - 2)], device=device, dtype=dtype) * mask_param
+    min_value = torch.rand(specgrams.shape[: (dim - 2)], device=device, dtype=dtype) * (specgrams.size(axis) - value)
     # Create broadcastable mask
     mask_start = min_value.long()[..., None, None]
@@ -879,24 +894,31 @@ def mask_along_axis(
     Mask will be applied from indices ``[v_0, v_0 + v)``,
     where ``v`` is sampled from ``uniform(0, max_v)`` and
-    ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with
+    ``v_0`` from ``uniform(0, specgram.size(axis) - v)``, with
     ``max_v = mask_param`` when ``p = 1.0`` and
-    ``max_v = min(mask_param, floor(specgrams.size(axis) * p))``
+    ``max_v = min(mask_param, floor(specgram.size(axis) * p))``
     otherwise.
     All examples will have the same mask interval.
     Args:
-        specgram (Tensor): Real spectrogram `(channel, freq, time)`
+        specgram (Tensor): Real spectrograms `(..., freq, time)`, with at least 2 dimensions.
         mask_param (int): Number of columns to be masked will be uniformly sampled from [0, mask_param]
         mask_value (float): Value to assign to the masked columns
-        axis (int): Axis to apply masking on (1 -> frequency, 2 -> time)
+        axis (int): Axis to apply masking on, which should be the one of the last two dimensions.
         p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0)
     Returns:
-        Tensor: Masked spectrogram of dimensions `(channel, freq, time)`
+        Tensor: Masked spectrograms with the same dimensions as input specgram Tensor
     """
-    if axis not in [1, 2]:
-        raise ValueError("Only Frequency and Time masking are supported")
+    dim = specgram.dim()
+    if dim < 2:
+        raise ValueError(f"Spectrogram must have at least two dimensions (time and frequency) ({dim} given).")
+    if axis not in [dim - 2, dim - 1]:
+        raise ValueError(
+            f"Only Frequency and Time masking are supported (axis {dim-2} and axis {dim-1} supported; {axis} given)."
+        )
     if not 0.0 <= p <= 1.0:
         raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).")
@@ -908,14 +930,17 @@ def mask_along_axis(
     # pack batch
     shape = specgram.size()
     specgram = specgram.reshape([-1] + list(shape[-2:]))
+    # After packing, specgram is a 3D tensor, and the axis corresponding to the to-be-masked dimension
+    # is now (axis - dim + 3), e.g. a tensor of shape (10, 2, 50, 10, 2) becomes a tensor of shape (1000, 10, 2).
     value = torch.rand(1) * mask_param
-    min_value = torch.rand(1) * (specgram.size(axis) - value)
+    min_value = torch.rand(1) * (specgram.size(axis - dim + 3) - value)
     mask_start = (min_value.long()).squeeze()
     mask_end = (min_value.long() + value.long()).squeeze()
-    mask = torch.arange(0, specgram.shape[axis], device=specgram.device, dtype=specgram.dtype)
+    mask = torch.arange(0, specgram.shape[axis - dim + 3], device=specgram.device, dtype=specgram.dtype)
     mask = (mask >= mask_start) & (mask < mask_end)
-    if axis == 1:
+    # unsqueeze the mask if the axis is frequency
+    if axis == dim - 2:
         mask = mask.unsqueeze(-1)
     if mask_end - mask_start >= mask_param:
@@ -1019,8 +1044,8 @@ def _compute_nccf(waveform: Tensor, sample_rate: int, frame_time: float, freq_lo
         output_frames = (
             (s1 * s2).sum(-1)
-            / (EPSILON + torch.norm(s1, p=2, dim=-1)).pow(2)
-            / (EPSILON + torch.norm(s2, p=2, dim=-1)).pow(2)
+            / (EPSILON + torch.linalg.vector_norm(s1, ord=2, dim=-1)).pow(2)
+            / (EPSILON + torch.linalg.vector_norm(s2, ord=2, dim=-1)).pow(2)
         )
         output_lag.append(output_frames.unsqueeze(-1))
@@ -1271,6 +1296,7 @@ def spectral_centroid(
 @torchaudio._extension.fail_if_no_sox
+@deprecated("Please migrate to :py:class:`torchaudio.io.AudioEffector`.", remove=False)
 def apply_codec(
     waveform: Tensor,
     sample_rate: int,
@@ -1303,129 +1329,17 @@ def apply_codec(
         Tensor: Resulting Tensor.
         If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`.
     """
-    bytes = io.BytesIO()
-    torchaudio.backend.sox_io_backend.save(
-        bytes, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample
-    )
-    bytes.seek(0)
-    augmented, sr = torchaudio.backend.sox_io_backend.load(bytes, channels_first=channels_first, format=format)
+    with tempfile.NamedTemporaryFile() as f:
+        torchaudio.backend.sox_io_backend.save(
+            f.name, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample
+        )
+        augmented, sr = torchaudio.backend.sox_io_backend.load(f.name, channels_first=channels_first, format=format)
     if sr != sample_rate:
         augmented = resample(augmented, sr, sample_rate)
     return augmented
-@torchaudio._extension.fail_if_no_kaldi
-def compute_kaldi_pitch(
-    waveform: torch.Tensor,
-    sample_rate: float,
-    frame_length: float = 25.0,
-    frame_shift: float = 10.0,
-    min_f0: float = 50,
-    max_f0: float = 400,
-    soft_min_f0: float = 10.0,
-    penalty_factor: float = 0.1,
-    lowpass_cutoff: float = 1000,
-    resample_frequency: float = 4000,
-    delta_pitch: float = 0.005,
-    nccf_ballast: float = 7000,
-    lowpass_filter_width: int = 1,
-    upsample_filter_width: int = 5,
-    max_frames_latency: int = 0,
-    frames_per_chunk: int = 0,
-    simulate_first_pass_online: bool = False,
-    recompute_frame: int = 500,
-    snip_edges: bool = True,
-) -> torch.Tensor:
-    """Extract pitch based on method described in *A pitch extraction algorithm tuned
-    for automatic speech recognition* :cite:`6854049`.
-    .. devices:: CPU
-    .. properties:: TorchScript
-    This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
-    Args:
-        waveform (Tensor):
-            The input waveform of shape `(..., time)`.
-        sample_rate (float):
-            Sample rate of `waveform`.
-        frame_length (float, optional):
-            Frame length in milliseconds. (default: 25.0)
-        frame_shift (float, optional):
-            Frame shift in milliseconds. (default: 10.0)
-        min_f0 (float, optional):
-            Minimum F0 to search for (Hz)  (default: 50.0)
-        max_f0 (float, optional):
-            Maximum F0 to search for (Hz)  (default: 400.0)
-        soft_min_f0 (float, optional):
-            Minimum f0, applied in soft way, must not exceed min-f0  (default: 10.0)
-        penalty_factor (float, optional):
-            Cost factor for FO change.  (default: 0.1)
-        lowpass_cutoff (float, optional):
-            Cutoff frequency for LowPass filter (Hz) (default: 1000)
-        resample_frequency (float, optional):
-            Frequency that we down-sample the signal to. Must be more than twice lowpass-cutoff.
-            (default: 4000)
-        delta_pitch( float, optional):
-            Smallest relative change in pitch that our algorithm measures. (default: 0.005)
-        nccf_ballast (float, optional):
-            Increasing this factor reduces NCCF for quiet frames (default: 7000)
-        lowpass_filter_width (int, optional):
-            Integer that determines filter width of lowpass filter, more gives sharper filter.
-            (default: 1)
-        upsample_filter_width (int, optional):
-            Integer that determines filter width when upsampling NCCF. (default: 5)
-        max_frames_latency (int, optional):
-            Maximum number of frames of latency that we allow pitch tracking to introduce into
-            the feature processing (affects output only if ``frames_per_chunk > 0`` and
-            ``simulate_first_pass_online=True``) (default: 0)
-        frames_per_chunk (int, optional):
-            The number of frames used for energy normalization. (default: 0)
-        simulate_first_pass_online (bool, optional):
-            If true, the function will output features that correspond to what an online decoder
-            would see in the first pass of decoding -- not the final version of the features,
-            which is the default. (default: False)
-            Relevant if ``frames_per_chunk > 0``.
-        recompute_frame (int, optional):
-            Only relevant for compatibility with online pitch extraction.
-            A non-critical parameter; the frame at which we recompute some of the forward pointers,
-            after revising our estimate of the signal energy.
-            Relevant if ``frames_per_chunk > 0``. (default: 500)
-        snip_edges (bool, optional):
-            If this is set to false, the incomplete frames near the ending edge won't be snipped,
-            so that the number of frames is the file size divided by the frame-shift.
-            This makes different types of features give the same number of frames. (default: True)
-    Returns:
-       Tensor: Pitch feature. Shape: `(batch, frames 2)` where the last dimension
-       corresponds to pitch and NCCF.
-    """
-    shape = waveform.shape
-    waveform = waveform.reshape(-1, shape[-1])
-    result = torch.ops.torchaudio.kaldi_ComputeKaldiPitch(
-        waveform,
-        sample_rate,
-        frame_length,
-        frame_shift,
-        min_f0,
-        max_f0,
-        soft_min_f0,
-        penalty_factor,
-        lowpass_cutoff,
-        resample_frequency,
-        delta_pitch,
-        nccf_ballast,
-        lowpass_filter_width,
-        upsample_filter_width,
-        max_frames_latency,
-        frames_per_chunk,
-        simulate_first_pass_online,
-        recompute_frame,
-        snip_edges,
-    )
-    result = result.reshape(shape[:-1] + result.shape[-2:])
-    return result
+_CPU = torch.device("cpu")
 def _get_sinc_resample_kernel(
@@ -1436,10 +1350,9 @@ def _get_sinc_resample_kernel(
     rolloff: float = 0.99,
     resampling_method: str = "sinc_interp_hann",
     beta: Optional[float] = None,
-    device: torch.device = torch.device("cpu"),
+    device: torch.device = _CPU,
     dtype: Optional[torch.dtype] = None,
 ):
     if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq):
         raise Exception(
             "Frequencies must be of integer type to ensure quality resampling computation. "
@@ -1550,7 +1463,7 @@ def _apply_sinc_resample_kernel(
     waveform = torch.nn.functional.pad(waveform, (width, width + orig_freq))
     resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq)
     resampled = resampled.transpose(1, 2).reshape(num_wavs, -1)
-    target_length = int(math.ceil(new_freq * length / orig_freq))
+    target_length = torch.ceil(torch.as_tensor(new_freq * length / orig_freq)).long()
     resampled = resampled[..., :target_length]
     # unpack batch
@@ -2580,3 +2493,41 @@ def deemphasis(waveform, coeff: float = 0.97) -> torch.Tensor:
     a_coeffs = torch.tensor([1.0, -coeff], dtype=waveform.dtype, device=waveform.device)
     b_coeffs = torch.tensor([1.0, 0.0], dtype=waveform.dtype, device=waveform.device)
     return torchaudio.functional.lfilter(waveform, a_coeffs=a_coeffs, b_coeffs=b_coeffs)
+def frechet_distance(mu_x, sigma_x, mu_y, sigma_y):
+    r"""Computes the Fréchet distance between two multivariate normal distributions :cite:`dowson1982frechet`.
+    Concretely, for multivariate Gaussians :math:`X(\mu_X, \Sigma_X)`
+    and :math:`Y(\mu_Y, \Sigma_Y)`, the function computes and returns :math:`F` as
+    .. math::
+        F(X, Y) = || \mu_X - \mu_Y ||_2^2
+        + \text{Tr}\left( \Sigma_X + \Sigma_Y - 2 \sqrt{\Sigma_X \Sigma_Y} \right)
+    Args:
+        mu_x (torch.Tensor): mean :math:`\mu_X` of multivariate Gaussian :math:`X`, with shape `(N,)`.
+        sigma_x (torch.Tensor): covariance matrix :math:`\Sigma_X` of :math:`X`, with shape `(N, N)`.
+        mu_y (torch.Tensor): mean :math:`\mu_Y` of multivariate Gaussian :math:`Y`, with shape `(N,)`.
+        sigma_y (torch.Tensor): covariance matrix :math:`\Sigma_Y` of :math:`Y`, with shape `(N, N)`.
+    Returns:
+        torch.Tensor: the Fréchet distance between :math:`X` and :math:`Y`.
+    """
+    if len(mu_x.size()) != 1:
+        raise ValueError(f"Input mu_x must be one-dimensional; got dimension {len(mu_x.size())}.")
+    if len(sigma_x.size()) != 2:
+        raise ValueError(f"Input sigma_x must be two-dimensional; got dimension {len(sigma_x.size())}.")
+    if sigma_x.size(0) != sigma_x.size(1) != mu_x.size(0):
+        raise ValueError("Each of sigma_x's dimensions must match mu_x's size.")
+    if mu_x.size() != mu_y.size():
+        raise ValueError(f"Inputs mu_x and mu_y must have the same shape; got {mu_x.size()} and {mu_y.size()}.")
+    if sigma_x.size() != sigma_y.size():
+        raise ValueError(
+            f"Inputs sigma_x and sigma_y must have the same shape; got {sigma_x.size()} and {sigma_y.size()}."
+        )
+    a = (mu_x - mu_y).square().sum()
+    b = sigma_x.trace() + sigma_y.trace()
+    c = torch.linalg.eigvals(sigma_x @ sigma_y).sqrt().real.sum()
+    return a + b - 2 * c

torchaudio/io/__init__.py CHANGED Viewed

@@ -1,10 +1,13 @@
+from ._effector import AudioEffector
 from ._playback import play_audio
 from ._stream_reader import StreamReader
-from ._stream_writer import StreamWriter
+from ._stream_writer import CodecConfig, StreamWriter
 __all__ = [
+    "AudioEffector",
     "StreamReader",
     "StreamWriter",
+    "CodecConfig",
     "play_audio",
 ]