torchaudio 2.0.2-cp38-cp38-manylinux1_x86_64.whl → 2.1.1-cp38-cp38-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchaudio might be problematic.
- torchaudio/__init__.py +22 -3
- torchaudio/_backend/__init__.py +55 -4
- torchaudio/_backend/backend.py +53 -0
- torchaudio/_backend/common.py +52 -0
- torchaudio/_backend/ffmpeg.py +373 -0
- torchaudio/_backend/soundfile.py +54 -0
- torchaudio/_backend/soundfile_backend.py +457 -0
- torchaudio/_backend/sox.py +91 -0
- torchaudio/_backend/utils.py +81 -323
- torchaudio/_extension/__init__.py +55 -36
- torchaudio/_extension/utils.py +109 -17
- torchaudio/_internal/__init__.py +4 -1
- torchaudio/_internal/module_utils.py +37 -6
- torchaudio/backend/__init__.py +7 -11
- torchaudio/backend/_no_backend.py +24 -0
- torchaudio/backend/_sox_io_backend.py +297 -0
- torchaudio/backend/common.py +12 -52
- torchaudio/backend/no_backend.py +11 -21
- torchaudio/backend/soundfile_backend.py +11 -448
- torchaudio/backend/sox_io_backend.py +11 -435
- torchaudio/backend/utils.py +9 -18
- torchaudio/datasets/__init__.py +2 -0
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/cmudict.py +61 -62
- torchaudio/datasets/dr_vctk.py +1 -1
- torchaudio/datasets/gtzan.py +1 -1
- torchaudio/datasets/librilight_limited.py +1 -1
- torchaudio/datasets/librispeech.py +1 -1
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +1 -1
- torchaudio/datasets/ljspeech.py +1 -1
- torchaudio/datasets/musdb_hq.py +1 -1
- torchaudio/datasets/quesst14.py +1 -1
- torchaudio/datasets/speechcommands.py +1 -1
- torchaudio/datasets/tedlium.py +1 -1
- torchaudio/datasets/vctk.py +1 -1
- torchaudio/datasets/voxceleb1.py +1 -1
- torchaudio/datasets/yesno.py +1 -1
- torchaudio/functional/__init__.py +6 -2
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +69 -92
- torchaudio/functional/functional.py +99 -148
- torchaudio/io/__init__.py +4 -1
- torchaudio/io/_effector.py +347 -0
- torchaudio/io/_stream_reader.py +158 -90
- torchaudio/io/_stream_writer.py +196 -10
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/_torchaudio_sox.so +0 -0
- torchaudio/lib/libctc_prefix_decoder.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/libtorchaudio_sox.so +0 -0
- torchaudio/lib/pybind11_prefixctc.so +0 -0
- torchaudio/models/__init__.py +14 -0
- torchaudio/models/decoder/__init__.py +22 -7
- torchaudio/models/decoder/_ctc_decoder.py +123 -69
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/rnnt_decoder.py +10 -14
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/wav2vec2/components.py +6 -10
- torchaudio/pipelines/__init__.py +9 -0
- torchaudio/pipelines/_squim_pipeline.py +176 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +198 -68
- torchaudio/pipelines/_wav2vec2/utils.py +120 -0
- torchaudio/sox_effects/sox_effects.py +7 -30
- torchaudio/transforms/__init__.py +2 -0
- torchaudio/transforms/_transforms.py +99 -54
- torchaudio/utils/download.py +2 -2
- torchaudio/utils/ffmpeg_utils.py +20 -15
- torchaudio/utils/sox_utils.py +8 -9
- torchaudio/version.py +2 -2
- torchaudio-2.1.1.dist-info/METADATA +113 -0
- torchaudio-2.1.1.dist-info/RECORD +119 -0
- torchaudio/io/_compat.py +0 -241
- torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
- torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
- torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
- torchaudio/lib/libflashlight-text.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
- torchaudio-2.0.2.dist-info/METADATA +0 -26
- torchaudio-2.0.2.dist-info/RECORD +0 -100
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
torchaudio/transforms/_transforms.py
CHANGED

@@ -36,7 +36,7 @@ class Spectrogram(torch.nn.Module):
         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
         power (float or None, optional): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc.
+            (must be > 0) e.g., 1 for magnitude, 2 for power, etc.
             If None, then the complex spectrum is returned instead. (Default: ``2``)
         normalized (bool or str, optional): Whether to normalize by magnitude after stft. If input is str, choices are
             ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to
@@ -227,7 +227,7 @@ class GriffinLim(torch.nn.Module):
         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
         power (float, optional): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
+            (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
         wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
         momentum (float, optional): The momentum parameter for fast Griffin-Lim.
             Setting this to 0 recovers the original Griffin-Lim method.
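For context on the `power` argument these docstring fixes describe: it is the exponent applied to the magnitude of each STFT frame, and `None` switches the transform to returning the raw complex spectrum. A minimal sketch (the random waveform is only a stand-in for real audio):

>>> import torch
>>> import torchaudio.transforms as T
>>> waveform = torch.randn(1, 16000)  # placeholder 1-second mono signal
>>> magnitude = T.Spectrogram(power=1)(waveform)   # |STFT|, i.e. magnitude
>>> power = T.Spectrogram(power=2)(waveform)       # |STFT|**2, i.e. power (the default)
>>> T.Spectrogram(power=None)(waveform).dtype      # raw complex spectrum
torch.complex64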
@@ -420,7 +420,7 @@ class InverseMelScale(torch.nn.Module):
     .. devices:: CPU CUDA

     It minimizes the euclidian norm between the input mel-spectrogram and the product between
-    the estimated spectrogram and the filter banks using SGD.
+    the estimated spectrogram and the filter banks using `torch.linalg.lstsq`.

     Args:
         n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
@@ -428,13 +428,13 @@ class InverseMelScale(torch.nn.Module):
         sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
         f_min (float, optional): Minimum frequency. (Default: ``0.``)
         f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
-        max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
-        tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
-        tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
-        sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
         norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band
             (area normalization). (Default: ``None``)
         mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+        driver (str, optional): Name of the LAPACK/MAGMA method to be used for `torch.lstsq`.
+            For CPU inputs the valid values are ``"gels"``, ``"gelsy"``, ``"gelsd"``, ``"gelss"``.
+            For CUDA input, the only valid driver is ``"gels"``, which assumes that A is full-rank.
+            (Default: ``"gels``)

     Example
         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
@@ -449,10 +449,6 @@ class InverseMelScale(torch.nn.Module):
         "sample_rate",
         "f_min",
         "f_max",
-        "max_iter",
-        "tolerance_loss",
-        "tolerance_change",
-        "sgdargs",
     ]

     def __init__(
@@ -462,26 +458,23 @@ class InverseMelScale(torch.nn.Module):
         sample_rate: int = 16000,
         f_min: float = 0.0,
         f_max: Optional[float] = None,
-        max_iter: int = 100000,
-        tolerance_loss: float = 1e-5,
-        tolerance_change: float = 1e-8,
-        sgdargs: Optional[dict] = None,
         norm: Optional[str] = None,
         mel_scale: str = "htk",
+        driver: str = "gels",
     ) -> None:
         super(InverseMelScale, self).__init__()
         self.n_mels = n_mels
         self.sample_rate = sample_rate
         self.f_max = f_max or float(sample_rate // 2)
         self.f_min = f_min
-        self.max_iter = max_iter
-        self.tolerance_loss = tolerance_loss
-        self.tolerance_change = tolerance_change
-        self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
+        self.driver = driver

         if f_min > self.f_max:
             raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))

+        if driver not in ["gels", "gelsy", "gelsd", "gelss"]:
+            raise ValueError(f'driver must be one of ["gels", "gelsy", "gelsd", "gelss"]. Found {driver}.')
+
         fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, norm, mel_scale)
         self.register_buffer("fb", fb)

@@ -499,34 +492,10 @@ class InverseMelScale(torch.nn.Module):

         n_mels, time = shape[-2], shape[-1]
         freq, _ = self.fb.size()  # (freq, n_mels)
-        melspec = melspec.transpose(-1, -2)
         if self.n_mels != n_mels:
             raise ValueError("Expected an input with {} mel bins. Found: {}".format(self.n_mels, n_mels))

-        specgram = torch.rand(
-            melspec.size()[0], time, freq, requires_grad=True, dtype=melspec.dtype, device=melspec.device
-        )
-
-        optim = torch.optim.SGD([specgram], **self.sgdargs)
-
-        loss = float("inf")
-        for _ in range(self.max_iter):
-            optim.zero_grad()
-            diff = melspec - specgram.matmul(self.fb)
-            new_loss = diff.pow(2).sum(axis=-1).mean()
-            # take sum over mel-frequency then average over other dimensions
-            # so that loss threshold is applied par unit timeframe
-            new_loss.backward()
-            optim.step()
-            specgram.data = specgram.data.clamp(min=0)
-
-            new_loss = new_loss.item()
-            if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
-                break
-            loss = new_loss
-
-        specgram.requires_grad_(False)
-        specgram = specgram.clamp(min=0).transpose(-1, -2)
+        specgram = torch.relu(torch.linalg.lstsq(self.fb.transpose(-1, -2)[None], melspec, driver=self.driver).solution)

         # unpack batch
         specgram = specgram.view(shape[:-2] + (freq, time))
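The rewritten `forward` replaces the old SGD loop with a single batched least-squares solve: `torch.linalg.lstsq` finds the spectrogram that best reproduces the mel-spectrogram under the filter bank `fb`, and `torch.relu` clamps negative solutions to zero. A usage sketch of the new signature (the shapes and the `driver` choice are illustrative; per the docstring, CUDA inputs only accept "gels"):

>>> import torch
>>> import torchaudio.transforms as T
>>> melspec = T.MelSpectrogram(sample_rate=16000, n_fft=400, n_mels=64)(torch.randn(1, 16000))
>>> inverse = T.InverseMelScale(n_stft=201, n_mels=64, driver="gelsd")  # n_stft = n_fft // 2 + 1
>>> specgram = inverse(melspec)  # (1, 201, time), non-negative, solved in one shot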
@@ -540,7 +509,7 @@ class MelSpectrogram(torch.nn.Module):

     .. properties:: Autograd TorchScript

-    This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
+    This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
     and :py:func:`torchaudio.transforms.MelScale`.

     Sources
@@ -560,7 +529,7 @@ class MelSpectrogram(torch.nn.Module):
         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
         power (float, optional): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
+            (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
         normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
         wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
         center (bool, optional): whether to pad :attr:`waveform` on both sides so
@@ -1196,15 +1165,16 @@ class _AxisMasking(torch.nn.Module):

     Args:
         mask_param (int): Maximum possible length of the mask.
-        axis (int): What dimension the mask is applied on.
+        axis (int): What dimension the mask is applied on (assuming the tensor is 3D).
+            For frequency masking, axis = 1.
+            For time masking, axis = 2.
         iid_masks (bool): Applies iid masks to each of the examples in the batch dimension.
-            This option is applicable only when the input tensor is 4D.
+            This option is applicable only when the dimension of the input tensor is >= 3.
         p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0)
     """
     __constants__ = ["mask_param", "axis", "iid_masks", "p"]

     def __init__(self, mask_param: int, axis: int, iid_masks: bool, p: float = 1.0) -> None:
-
         super(_AxisMasking, self).__init__()
         self.mask_param = mask_param
         self.axis = axis
@@ -1221,10 +1191,14 @@ class _AxisMasking(torch.nn.Module):
             Tensor: Masked spectrogram of dimensions `(..., freq, time)`.
         """
         # if iid_masks flag marked and specgram has a batch dimension
-        if self.iid_masks and specgram.dim() == 4:
-            return F.mask_along_axis_iid(specgram, self.mask_param, mask_value, self.axis + 1, p=self.p)
+        # self.axis + specgram.dim() - 3 gives the time/frequency dimension (last two dimensions)
+        # for input tensor for which the dimension is not 3.
+        if self.iid_masks:
+            return F.mask_along_axis_iid(
+                specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p
+            )
         else:
-            return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis, p=self.p)
+            return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p)


 class FrequencyMasking(_AxisMasking):
@@ -1241,7 +1215,7 @@ class FrequencyMasking(_AxisMasking):
             Indices uniformly sampled from [0, freq_mask_param).
         iid_masks (bool, optional): whether to apply different masks to each
             example/channel in the batch. (Default: ``False``)
-            This option is applicable only when the input tensor is 4D.
+            This option is applicable only when the input tensor >= 3D.

     Example
         >>> spectrogram = torchaudio.transforms.Spectrogram()
@@ -1275,7 +1249,7 @@ class TimeMasking(_AxisMasking):
             Indices uniformly sampled from [0, time_mask_param).
         iid_masks (bool, optional): whether to apply different masks to each
             example/channel in the batch. (Default: ``False``)
-            This option is applicable only when the input tensor is 4D.
+            This option is applicable only when the input tensor >= 3D.
         p (float, optional): maximum proportion of time steps that can be masked.
             Must be within range [0.0, 1.0]. (Default: 1.0)
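The `self.axis + specgram.dim() - 3` offset is what lets the masking transforms accept inputs with extra leading dimensions: `axis` is still specified as if the tensor were 3D `(batch, freq, time)`, and the offset maps it onto the actual last two dimensions. A sketch with a 4D batch (shapes are illustrative):

>>> import torch
>>> import torchaudio.transforms as T
>>> specgram = torch.randn(8, 2, 201, 400)  # (batch, channel, freq, time)
>>> masking = T.TimeMasking(time_mask_param=80, iid_masks=True)
>>> masked = masking(specgram)  # an independent time mask per example/channel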
@@ -1299,6 +1273,77 @@ class TimeMasking(_AxisMasking):
         super(TimeMasking, self).__init__(time_mask_param, 2, iid_masks, p=p)


+class SpecAugment(torch.nn.Module):
+    r"""Apply time and frequency masking to a spectrogram.
+    Args:
+        n_time_masks (int): Number of time masks. If its value is zero, no time masking will be applied.
+        time_mask_param (int): Maximum possible length of the time mask.
+        n_freq_masks (int): Number of frequency masks. If its value is zero, no frequency masking will be applied.
+        freq_mask_param (int): Maximum possible length of the frequency mask.
+        iid_masks (bool, optional): Applies iid masks to each of the examples in the batch dimension.
+            This option is applicable only when the input tensor is 4D. (Default: ``True``)
+        p (float, optional): maximum proportion of time steps that can be masked.
+            Must be within range [0.0, 1.0]. (Default: 1.0)
+        zero_masking (bool, optional): If ``True``, use 0 as the mask value,
+            else use mean of the input tensor. (Default: ``False``)
+    """
+    __constants__ = [
+        "n_time_masks",
+        "time_mask_param",
+        "n_freq_masks",
+        "freq_mask_param",
+        "iid_masks",
+        "p",
+        "zero_masking",
+    ]
+
+    def __init__(
+        self,
+        n_time_masks: int,
+        time_mask_param: int,
+        n_freq_masks: int,
+        freq_mask_param: int,
+        iid_masks: bool = True,
+        p: float = 1.0,
+        zero_masking: bool = False,
+    ) -> None:
+        super(SpecAugment, self).__init__()
+        self.n_time_masks = n_time_masks
+        self.time_mask_param = time_mask_param
+        self.n_freq_masks = n_freq_masks
+        self.freq_mask_param = freq_mask_param
+        self.iid_masks = iid_masks
+        self.p = p
+        self.zero_masking = zero_masking
+
+    def forward(self, specgram: Tensor) -> Tensor:
+        r"""
+        Args:
+            specgram (Tensor): Tensor of shape `(..., freq, time)`.
+        Returns:
+            Tensor: Masked spectrogram of shape `(..., freq, time)`.
+        """
+        if self.zero_masking:
+            mask_value = 0.0
+        else:
+            mask_value = specgram.mean()
+        time_dim = specgram.dim() - 1
+        freq_dim = time_dim - 1
+
+        if specgram.dim() > 2 and self.iid_masks is True:
+            for _ in range(self.n_time_masks):
+                specgram = F.mask_along_axis_iid(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
+            for _ in range(self.n_freq_masks):
+                specgram = F.mask_along_axis_iid(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
+        else:
+            for _ in range(self.n_time_masks):
+                specgram = F.mask_along_axis(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
+            for _ in range(self.n_freq_masks):
+                specgram = F.mask_along_axis(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
+
+        return specgram
+
+
 class Loudness(torch.nn.Module):
     r"""Measure audio loudness according to the ITU-R BS.1770-4 recommendation.

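The new `SpecAugment` module chains the masking primitives above: each forward pass applies `n_time_masks` time masks and then `n_freq_masks` frequency masks, filling with zeros or the input mean. A usage sketch (mask counts and widths are arbitrary):

>>> import torch
>>> import torchaudio.transforms as T
>>> specgram = torch.randn(4, 201, 400)  # (batch, freq, time)
>>> augment = T.SpecAugment(
...     n_time_masks=2,
...     time_mask_param=40,
...     n_freq_masks=2,
...     freq_mask_param=30,
...     zero_masking=True,
... )
>>> augmented = augment(specgram)  # same shape, with masked rectangles zeroed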
torchaudio/utils/download.py
CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import Union

 import torch
-
+from torchaudio._internal import download_url_to_file

 _LG = logging.getLogger(__name__)

@@ -18,7 +18,7 @@ def _get_local_path(key):

 def _download(key, path, progress):
     url = f"https://download.pytorch.org/torchaudio/{key}"
-
+    download_url_to_file(url, path, progress=progress)


 def _get_hash(path, hash, chunk_size=1028):
torchaudio/utils/ffmpeg_utils.py
CHANGED
@@ -4,7 +4,6 @@ It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`
 """
 from typing import Dict, List, Tuple

-import torch
 import torchaudio


@@ -16,7 +15,7 @@ def get_versions() -> Dict[str, Tuple[int]]:
         dict: mapping from library names to version string,
             i.e. `"libavutil": (56, 22, 100)`.
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_versions()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -25,7 +24,7 @@ def get_log_level() -> int:

     See :py:func:`set_log_level` for the detail.
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_log_level()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -62,7 +61,7 @@ def set_log_level(level: int):
             Extremely verbose debugging, useful for libav* development.

     """
-
+    torchaudio._extension._FFMPEG_EXT.set_log_level(level)


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -80,7 +79,7 @@ def get_demuxers() -> Dict[str, str]:
         ... aax: CRI AAX
         ... ac3: raw AC-3
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_demuxers()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -99,7 +98,7 @@ def get_muxers() -> Dict[str, str]:
         ... adx: CRI ADX
         ... aiff: Audio IFF
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_muxers()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -118,7 +117,7 @@ def get_audio_decoders() -> Dict[str, str]:
         ... adx: CRI ADX
         ... aiff: Audio IFF
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_audio_decoders()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -138,7 +137,7 @@ def get_audio_encoders() -> Dict[str, str]:
         ... ac3_fixed: ATSC A/52A (AC-3)
         ... alac: ALAC (Apple Lossless Audio Codec)
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_audio_encoders()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -158,7 +157,7 @@ def get_video_decoders() -> Dict[str, str]:
         ... amv: AMV Video
         ... anm: Deluxe Paint Animation
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_video_decoders()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -179,7 +178,7 @@ def get_video_encoders() -> Dict[str, str]:
         ... asv1: ASUS V1
         ... asv2: ASUS V2
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_video_encoders()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -195,7 +194,7 @@ def get_input_devices() -> Dict[str, str]:
         ... avfoundation: AVFoundation input device
         ... lavfi: Libavfilter virtual input device
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_input_devices()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -210,7 +209,7 @@ def get_output_devices() -> Dict[str, str]:
         >>> print(f"{k}: {v}")
         ... audiotoolbox: AudioToolbox output device
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_output_devices()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -224,7 +223,7 @@ def get_input_protocols() -> List[str]:
         >>> print(get_input_protocols())
         ... ['file', 'ftp', 'hls', 'http', 'https', 'pipe', 'rtmp', 'tcp', 'tls', 'udp', 'unix']
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_input_protocols()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -238,7 +237,7 @@ def get_output_protocols() -> List[str]:
         >>> print(get_output_protocols())
         ... ['file', 'ftp', 'http', 'https', 'md5', 'pipe', 'prompeg', 'rtmp', 'tee', 'tcp', 'tls', 'udp', 'unix']
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_output_protocols()


 @torchaudio._extension.fail_if_no_ffmpeg
@@ -252,4 +251,10 @@ def get_build_config() -> str:
         >>> print(get_build_config())
         --prefix=/Users/runner/miniforge3 --cc=arm64-apple-darwin20.0.0-clang --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-neon --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-libvpx --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/pkg-config --enable-cross-compile --arch=arm64 --target-os=darwin --cross-prefix=arm64-apple-darwin20.0.0- --host-cc=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/x86_64-apple-darwin13.4.0-clang  # noqa
     """
-    return
+    return torchaudio._extension._FFMPEG_EXT.get_build_config()
+
+
+@torchaudio._extension.fail_if_no_ffmpeg
+def clear_cuda_context_cache():
+    """Clear the CUDA context used by CUDA Hardware accelerated video decoding"""
+    torchaudio._extension._FFMPEG_EXT.clear_cuda_context_cache()
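All of these stubs now return the result of the corresponding call on `torchaudio._extension._FFMPEG_EXT`, the lazily loaded binding that resolves to the matching `_torchaudio_ffmpeg4/5/6` library shipped in the wheel. A sketch of the query API, assuming an FFmpeg-enabled build (the version tuple is the docstring's example, not a guaranteed value):

>>> from torchaudio.utils import ffmpeg_utils
>>> ffmpeg_utils.get_versions()["libavutil"]  # e.g. (56, 22, 100)
>>> "mp3" in ffmpeg_utils.get_audio_decoders()
True
>>> ffmpeg_utils.set_log_level(8)  # lower values silence more of the libav* logging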
torchaudio/utils/sox_utils.py
CHANGED
@@ -4,7 +4,6 @@

 from typing import Dict, List

-import torch
 import torchaudio


@@ -18,7 +17,7 @@ def set_seed(seed: int):

     See Also:
         http://sox.sourceforge.net/sox.html
     """
-
+    torchaudio.lib._torchaudio_sox.set_seed(seed)


 @torchaudio._extension.fail_if_no_sox
@@ -36,7 +35,7 @@ def set_verbosity(verbosity: int):

     See Also:
         http://sox.sourceforge.net/sox.html
     """
-
+    torchaudio.lib._torchaudio_sox.set_verbosity(verbosity)


 @torchaudio._extension.fail_if_no_sox
@@ -49,7 +48,7 @@ def set_buffer_size(buffer_size: int):

     See Also:
         http://sox.sourceforge.net/sox.html
     """
-
+    torchaudio.lib._torchaudio_sox.set_buffer_size(buffer_size)


 @torchaudio._extension.fail_if_no_sox
@@ -63,7 +62,7 @@ def set_use_threads(use_threads: bool):

     See Also:
         http://sox.sourceforge.net/sox.html
     """
-
+    torchaudio.lib._torchaudio_sox.set_use_threads(use_threads)


 @torchaudio._extension.fail_if_no_sox
@@ -73,7 +72,7 @@ def list_effects() -> Dict[str, str]:

     Returns:
         Dict[str, str]: Mapping from ``effect name`` to ``usage``
     """
-    return dict(
+    return dict(torchaudio.lib._torchaudio_sox.list_effects())


 @torchaudio._extension.fail_if_no_sox
@@ -83,7 +82,7 @@ def list_read_formats() -> List[str]:

     Returns:
         List[str]: List of supported audio formats
     """
-    return
+    return torchaudio.lib._torchaudio_sox.list_read_formats()


 @torchaudio._extension.fail_if_no_sox
@@ -93,7 +92,7 @@ def list_write_formats() -> List[str]:

     Returns:
         List[str]: List of supported audio formats
     """
-    return
+    return torchaudio.lib._torchaudio_sox.list_write_formats()


 @torchaudio._extension.fail_if_no_sox
@@ -103,4 +102,4 @@ def get_buffer_size() -> int:

     Returns:
         int: size in bytes of buffers used for processing audio.
     """
-    return
+    return torchaudio.lib._torchaudio_sox.get_buffer_size()
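The SoX helpers follow the same pattern, calling straight into the bundled `_torchaudio_sox` extension. A sketch, assuming a build with SoX support (the sox backend is not available on Windows):

>>> from torchaudio.utils import sox_utils
>>> "flac" in sox_utils.list_read_formats()
True
>>> effects = sox_utils.list_effects()  # dict: effect name -> usage string
>>> sox_utils.set_verbosity(1)          # 1 = show failure messages only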
torchaudio/version.py
CHANGED
@@ -1,2 +1,2 @@
-__version__ = '2.
-git_version = '
+__version__ = '2.1.1+cu121'
+git_version = '5784206b90d738de888dce4c99b8b46be213f019'
torchaudio-2.1.1.dist-info/METADATA
ADDED

@@ -0,0 +1,113 @@
+Metadata-Version: 2.1
+Name: torchaudio
+Version: 2.1.1
+Summary: An audio package for PyTorch
+Home-page: https://github.com/pytorch/audio
+Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
+Author-email: soumith@pytorch.org
+Maintainer: Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
+Maintainer-email: moto@meta.com
+Classifier: Environment :: Plugins
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Operating System :: POSIX
+Classifier: Programming Language :: C++
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch (==2.1.1)
+
+torchaudio: an audio library for PyTorch
+========================================
+
+[](https://pytorch.org/audio/main/)
+[](https://anaconda.org/pytorch/torchaudio)
+[](https://anaconda.org/pytorch/torchaudio)
+
+
+
+The aim of torchaudio is to apply [PyTorch](https://github.com/pytorch/pytorch) to
+the audio domain. By supporting PyTorch, torchaudio follows the same philosophy
+of providing strong GPU acceleration, having a focus on trainable features through
+the autograd system, and having consistent style (tensor names and dimension names).
+Therefore, it is primarily a machine learning library and not a general signal
+processing library. The benefits of PyTorch can be seen in torchaudio through
+having all the computations be through PyTorch operations which makes it easy
+to use and feel like a natural extension.
+
+- [Support audio I/O (Load files, Save files)](http://pytorch.org/audio/main/)
+- Load a variety of audio formats, such as `wav`, `mp3`, `ogg`, `flac`, `opus`, `sphere`, into a torch Tensor using SoX
+- [Kaldi (ark/scp)](http://pytorch.org/audio/main/kaldi_io.html)
+- [Dataloaders for common audio datasets](http://pytorch.org/audio/main/datasets.html)
+- Audio and speech processing functions
+- [forced_align](https://pytorch.org/audio/main/generated/torchaudio.functional.forced_align.html)
+- Common audio transforms
+- [Spectrogram, AmplitudeToDB, MelScale, MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding, Resample](http://pytorch.org/audio/main/transforms.html)
+- Compliance interfaces: Run code using PyTorch that align with other libraries
+- [Kaldi: spectrogram, fbank, mfcc](https://pytorch.org/audio/main/compliance.kaldi.html)
+
+Installation
+------------
+
+Please refer to https://pytorch.org/audio/main/installation.html for installation and build process of TorchAudio.
+
+
+API Reference
+-------------
+
+API Reference is located here: http://pytorch.org/audio/main/
+
+Contributing Guidelines
+-----------------------
+
+Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md)
+
+Citation
+--------
+
+If you find this package useful, please cite as:
+
+```bibtex
+@article{yang2021torchaudio,
+  title={TorchAudio: Building Blocks for Audio and Speech Processing},
+  author={Yao-Yuan Yang and Moto Hira and Zhaoheng Ni and Anjali Chourdia and Artyom Astafurov and Caroline Chen and Ching-Feng Yeh and Christian Puhrsch and David Pollack and Dmitriy Genzel and Donny Greenberg and Edward Z. Yang and Jason Lian and Jay Mahadeokar and Jeff Hwang and Ji Chen and Peter Goldsborough and Prabhat Roy and Sean Narenthiran and Shinji Watanabe and Soumith Chintala and Vincent Quenneville-Bélair and Yangyang Shi},
+  journal={arXiv preprint arXiv:2110.15018},
+  year={2021}
+}
+```
+
+```bibtex
+@misc{hwang2023torchaudio,
+  title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
+  author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
+  year={2023},
+  eprint={2310.17864},
+  archivePrefix={arXiv},
+  primaryClass={eess.AS}
+}
+```
+
+Disclaimer on Datasets
+----------------------
+
+This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license.
+
+If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community!
+
+Pre-trained Model License
+-------------------------
+
+The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case.
+
+For instance, SquimSubjective model is released under the Creative Commons Attribution Non Commercial 4.0 International (CC-BY-NC 4.0) license. See [the link](https://zenodo.org/record/4660670#.ZBtWPOxuerN) for additional details.
+
+Other pre-trained models that have different license are noted in documentation. Please checkout the [documentation page](https://pytorch.org/audio/main/).