torchaudio 2.0.2__cp310-cp310-manylinux2014_aarch64.whl → 2.1.1__cp310-cp310-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. Click here for more details.

Files changed (90) hide show
  1. torchaudio/__init__.py +22 -3
  2. torchaudio/_backend/__init__.py +55 -4
  3. torchaudio/_backend/backend.py +53 -0
  4. torchaudio/_backend/common.py +52 -0
  5. torchaudio/_backend/ffmpeg.py +373 -0
  6. torchaudio/_backend/soundfile.py +54 -0
  7. torchaudio/_backend/soundfile_backend.py +457 -0
  8. torchaudio/_backend/sox.py +91 -0
  9. torchaudio/_backend/utils.py +81 -323
  10. torchaudio/_extension/__init__.py +55 -36
  11. torchaudio/_extension/utils.py +109 -17
  12. torchaudio/_internal/__init__.py +4 -1
  13. torchaudio/_internal/module_utils.py +37 -6
  14. torchaudio/backend/__init__.py +7 -11
  15. torchaudio/backend/_no_backend.py +24 -0
  16. torchaudio/backend/_sox_io_backend.py +297 -0
  17. torchaudio/backend/common.py +12 -52
  18. torchaudio/backend/no_backend.py +11 -21
  19. torchaudio/backend/soundfile_backend.py +11 -448
  20. torchaudio/backend/sox_io_backend.py +11 -435
  21. torchaudio/backend/utils.py +9 -18
  22. torchaudio/datasets/__init__.py +2 -0
  23. torchaudio/datasets/cmuarctic.py +1 -1
  24. torchaudio/datasets/cmudict.py +61 -62
  25. torchaudio/datasets/dr_vctk.py +1 -1
  26. torchaudio/datasets/gtzan.py +1 -1
  27. torchaudio/datasets/librilight_limited.py +1 -1
  28. torchaudio/datasets/librispeech.py +1 -1
  29. torchaudio/datasets/librispeech_biasing.py +189 -0
  30. torchaudio/datasets/libritts.py +1 -1
  31. torchaudio/datasets/ljspeech.py +1 -1
  32. torchaudio/datasets/musdb_hq.py +1 -1
  33. torchaudio/datasets/quesst14.py +1 -1
  34. torchaudio/datasets/speechcommands.py +1 -1
  35. torchaudio/datasets/tedlium.py +1 -1
  36. torchaudio/datasets/vctk.py +1 -1
  37. torchaudio/datasets/voxceleb1.py +1 -1
  38. torchaudio/datasets/yesno.py +1 -1
  39. torchaudio/functional/__init__.py +6 -2
  40. torchaudio/functional/_alignment.py +128 -0
  41. torchaudio/functional/filtering.py +69 -92
  42. torchaudio/functional/functional.py +99 -148
  43. torchaudio/io/__init__.py +4 -1
  44. torchaudio/io/_effector.py +347 -0
  45. torchaudio/io/_stream_reader.py +158 -90
  46. torchaudio/io/_stream_writer.py +196 -10
  47. torchaudio/lib/_torchaudio.so +0 -0
  48. torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
  49. torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
  50. torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
  51. torchaudio/lib/_torchaudio_sox.so +0 -0
  52. torchaudio/lib/libtorchaudio.so +0 -0
  53. torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
  54. torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
  55. torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
  56. torchaudio/lib/libtorchaudio_sox.so +0 -0
  57. torchaudio/models/__init__.py +14 -0
  58. torchaudio/models/decoder/__init__.py +22 -7
  59. torchaudio/models/decoder/_ctc_decoder.py +123 -69
  60. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  61. torchaudio/models/rnnt_decoder.py +10 -14
  62. torchaudio/models/squim/__init__.py +11 -0
  63. torchaudio/models/squim/objective.py +326 -0
  64. torchaudio/models/squim/subjective.py +150 -0
  65. torchaudio/models/wav2vec2/components.py +6 -10
  66. torchaudio/pipelines/__init__.py +9 -0
  67. torchaudio/pipelines/_squim_pipeline.py +176 -0
  68. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  69. torchaudio/pipelines/_wav2vec2/impl.py +198 -68
  70. torchaudio/pipelines/_wav2vec2/utils.py +120 -0
  71. torchaudio/sox_effects/sox_effects.py +7 -30
  72. torchaudio/transforms/__init__.py +2 -0
  73. torchaudio/transforms/_transforms.py +99 -54
  74. torchaudio/utils/download.py +2 -2
  75. torchaudio/utils/ffmpeg_utils.py +20 -15
  76. torchaudio/utils/sox_utils.py +8 -9
  77. torchaudio/version.py +2 -2
  78. torchaudio-2.1.1.dist-info/METADATA +113 -0
  79. torchaudio-2.1.1.dist-info/RECORD +117 -0
  80. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +1 -1
  81. torchaudio/io/_compat.py +0 -241
  82. torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
  83. torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
  84. torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
  85. torchaudio/lib/libflashlight-text.so +0 -0
  86. torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
  87. torchaudio-2.0.2.dist-info/METADATA +0 -26
  88. torchaudio-2.0.2.dist-info/RECORD +0 -100
  89. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
  90. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
@@ -36,7 +36,7 @@ class Spectrogram(torch.nn.Module):
36
36
  window_fn (Callable[..., Tensor], optional): A function to create a window tensor
37
37
  that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
38
38
  power (float or None, optional): Exponent for the magnitude spectrogram,
39
- (must be > 0) e.g., 1 for energy, 2 for power, etc.
39
+ (must be > 0) e.g., 1 for magnitude, 2 for power, etc.
40
40
  If None, then the complex spectrum is returned instead. (Default: ``2``)
41
41
  normalized (bool or str, optional): Whether to normalize by magnitude after stft. If input is str, choices are
42
42
  ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to
@@ -227,7 +227,7 @@ class GriffinLim(torch.nn.Module):
227
227
  window_fn (Callable[..., Tensor], optional): A function to create a window tensor
228
228
  that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
229
229
  power (float, optional): Exponent for the magnitude spectrogram,
230
- (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
230
+ (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
231
231
  wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
232
232
  momentum (float, optional): The momentum parameter for fast Griffin-Lim.
233
233
  Setting this to 0 recovers the original Griffin-Lim method.
@@ -420,7 +420,7 @@ class InverseMelScale(torch.nn.Module):
420
420
  .. devices:: CPU CUDA
421
421
 
422
422
  It minimizes the Euclidean norm between the input mel-spectrogram and the product between
423
- the estimated spectrogram and the filter banks using SGD.
423
+ the estimated spectrogram and the filter banks using `torch.linalg.lstsq`.
424
424
 
425
425
  Args:
426
426
  n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
@@ -428,13 +428,13 @@ class InverseMelScale(torch.nn.Module):
428
428
  sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
429
429
  f_min (float, optional): Minimum frequency. (Default: ``0.``)
430
430
  f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
431
- max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
432
- tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
433
- tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
434
- sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
435
431
  norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band
436
432
  (area normalization). (Default: ``None``)
437
433
  mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
434
+ driver (str, optional): Name of the LAPACK/MAGMA method to be used for `torch.linalg.lstsq`.
435
+ For CPU inputs the valid values are ``"gels"``, ``"gelsy"``, ``"gelsd"``, ``"gelss"``.
436
+ For CUDA input, the only valid driver is ``"gels"``, which assumes that A is full-rank.
437
+ (Default: ``"gels"``)
438
438
 
439
439
  Example
440
440
  >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
@@ -449,10 +449,6 @@ class InverseMelScale(torch.nn.Module):
449
449
  "sample_rate",
450
450
  "f_min",
451
451
  "f_max",
452
- "max_iter",
453
- "tolerance_loss",
454
- "tolerance_change",
455
- "sgdargs",
456
452
  ]
457
453
 
458
454
  def __init__(
@@ -462,26 +458,23 @@ class InverseMelScale(torch.nn.Module):
462
458
  sample_rate: int = 16000,
463
459
  f_min: float = 0.0,
464
460
  f_max: Optional[float] = None,
465
- max_iter: int = 100000,
466
- tolerance_loss: float = 1e-5,
467
- tolerance_change: float = 1e-8,
468
- sgdargs: Optional[dict] = None,
469
461
  norm: Optional[str] = None,
470
462
  mel_scale: str = "htk",
463
+ driver: str = "gels",
471
464
  ) -> None:
472
465
  super(InverseMelScale, self).__init__()
473
466
  self.n_mels = n_mels
474
467
  self.sample_rate = sample_rate
475
468
  self.f_max = f_max or float(sample_rate // 2)
476
469
  self.f_min = f_min
477
- self.max_iter = max_iter
478
- self.tolerance_loss = tolerance_loss
479
- self.tolerance_change = tolerance_change
480
- self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
470
+ self.driver = driver
481
471
 
482
472
  if f_min > self.f_max:
483
473
  raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
484
474
 
475
+ if driver not in ["gels", "gelsy", "gelsd", "gelss"]:
476
+ raise ValueError(f'driver must be one of ["gels", "gelsy", "gelsd", "gelss"]. Found {driver}.')
477
+
485
478
  fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, norm, mel_scale)
486
479
  self.register_buffer("fb", fb)
487
480
 
@@ -499,34 +492,10 @@ class InverseMelScale(torch.nn.Module):
499
492
 
500
493
  n_mels, time = shape[-2], shape[-1]
501
494
  freq, _ = self.fb.size() # (freq, n_mels)
502
- melspec = melspec.transpose(-1, -2)
503
495
  if self.n_mels != n_mels:
504
496
  raise ValueError("Expected an input with {} mel bins. Found: {}".format(self.n_mels, n_mels))
505
497
 
506
- specgram = torch.rand(
507
- melspec.size()[0], time, freq, requires_grad=True, dtype=melspec.dtype, device=melspec.device
508
- )
509
-
510
- optim = torch.optim.SGD([specgram], **self.sgdargs)
511
-
512
- loss = float("inf")
513
- for _ in range(self.max_iter):
514
- optim.zero_grad()
515
- diff = melspec - specgram.matmul(self.fb)
516
- new_loss = diff.pow(2).sum(axis=-1).mean()
517
- # take sum over mel-frequency then average over other dimensions
518
- # so that loss threshold is applied par unit timeframe
519
- new_loss.backward()
520
- optim.step()
521
- specgram.data = specgram.data.clamp(min=0)
522
-
523
- new_loss = new_loss.item()
524
- if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
525
- break
526
- loss = new_loss
527
-
528
- specgram.requires_grad_(False)
529
- specgram = specgram.clamp(min=0).transpose(-1, -2)
498
+ specgram = torch.relu(torch.linalg.lstsq(self.fb.transpose(-1, -2)[None], melspec, driver=self.driver).solution)
530
499
 
531
500
  # unpack batch
532
501
  specgram = specgram.view(shape[:-2] + (freq, time))
@@ -540,7 +509,7 @@ class MelSpectrogram(torch.nn.Module):
540
509
 
541
510
  .. properties:: Autograd TorchScript
542
511
 
543
- This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and
512
+ This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
544
513
  and :py:func:`torchaudio.transforms.MelScale`.
545
514
 
546
515
  Sources
@@ -560,7 +529,7 @@ class MelSpectrogram(torch.nn.Module):
560
529
  window_fn (Callable[..., Tensor], optional): A function to create a window tensor
561
530
  that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
562
531
  power (float, optional): Exponent for the magnitude spectrogram,
563
- (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
532
+ (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
564
533
  normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
565
534
  wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
566
535
  center (bool, optional): whether to pad :attr:`waveform` on both sides so
@@ -1196,15 +1165,16 @@ class _AxisMasking(torch.nn.Module):
1196
1165
 
1197
1166
  Args:
1198
1167
  mask_param (int): Maximum possible length of the mask.
1199
- axis (int): What dimension the mask is applied on.
1168
+ axis (int): What dimension the mask is applied on (assuming the tensor is 3D).
1169
+ For frequency masking, axis = 1.
1170
+ For time masking, axis = 2.
1200
1171
  iid_masks (bool): Applies iid masks to each of the examples in the batch dimension.
1201
- This option is applicable only when the input tensor is 4D.
1172
+ This option is applicable only when the dimension of the input tensor is >= 3.
1202
1173
  p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0)
1203
1174
  """
1204
1175
  __constants__ = ["mask_param", "axis", "iid_masks", "p"]
1205
1176
 
1206
1177
  def __init__(self, mask_param: int, axis: int, iid_masks: bool, p: float = 1.0) -> None:
1207
-
1208
1178
  super(_AxisMasking, self).__init__()
1209
1179
  self.mask_param = mask_param
1210
1180
  self.axis = axis
@@ -1221,10 +1191,14 @@ class _AxisMasking(torch.nn.Module):
1221
1191
  Tensor: Masked spectrogram of dimensions `(..., freq, time)`.
1222
1192
  """
1223
1193
  # if iid_masks flag marked and specgram has a batch dimension
1224
- if self.iid_masks and specgram.dim() == 4:
1225
- return F.mask_along_axis_iid(specgram, self.mask_param, mask_value, self.axis + 1, p=self.p)
1194
+ # self.axis + specgram.dim() - 3 gives the time/frequency dimension (last two dimensions)
1195
+ # for input tensor for which the dimension is not 3.
1196
+ if self.iid_masks:
1197
+ return F.mask_along_axis_iid(
1198
+ specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p
1199
+ )
1226
1200
  else:
1227
- return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis, p=self.p)
1201
+ return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p)
1228
1202
 
1229
1203
 
1230
1204
  class FrequencyMasking(_AxisMasking):
@@ -1241,7 +1215,7 @@ class FrequencyMasking(_AxisMasking):
1241
1215
  Indices uniformly sampled from [0, freq_mask_param).
1242
1216
  iid_masks (bool, optional): whether to apply different masks to each
1243
1217
  example/channel in the batch. (Default: ``False``)
1244
- This option is applicable only when the input tensor is 4D.
1218
+ This option is applicable only when the input tensor >= 3D.
1245
1219
 
1246
1220
  Example
1247
1221
  >>> spectrogram = torchaudio.transforms.Spectrogram()
@@ -1275,7 +1249,7 @@ class TimeMasking(_AxisMasking):
1275
1249
  Indices uniformly sampled from [0, time_mask_param).
1276
1250
  iid_masks (bool, optional): whether to apply different masks to each
1277
1251
  example/channel in the batch. (Default: ``False``)
1278
- This option is applicable only when the input tensor is 4D.
1252
+ This option is applicable only when the input tensor >= 3D.
1279
1253
  p (float, optional): maximum proportion of time steps that can be masked.
1280
1254
  Must be within range [0.0, 1.0]. (Default: 1.0)
1281
1255
 
@@ -1299,6 +1273,77 @@ class TimeMasking(_AxisMasking):
1299
1273
  super(TimeMasking, self).__init__(time_mask_param, 2, iid_masks, p=p)
1300
1274
 
1301
1275
 
1276
+ class SpecAugment(torch.nn.Module):
1277
+ r"""Apply time and frequency masking to a spectrogram.
1278
+ Args:
1279
+ n_time_masks (int): Number of time masks. If its value is zero, no time masking will be applied.
1280
+ time_mask_param (int): Maximum possible length of the time mask.
1281
+ n_freq_masks (int): Number of frequency masks. If its value is zero, no frequency masking will be applied.
1282
+ freq_mask_param (int): Maximum possible length of the frequency mask.
1283
+ iid_masks (bool, optional): Applies iid masks to each of the examples in the batch dimension.
1284
+ This option is applicable only when the input tensor is at least 3D. (Default: ``True``)
1285
+ p (float, optional): maximum proportion of time steps that can be masked.
1286
+ Must be within range [0.0, 1.0]. (Default: 1.0)
1287
+ zero_masking (bool, optional): If ``True``, use 0 as the mask value,
1288
+ else use mean of the input tensor. (Default: ``False``)
1289
+ """
1290
+ __constants__ = [
1291
+ "n_time_masks",
1292
+ "time_mask_param",
1293
+ "n_freq_masks",
1294
+ "freq_mask_param",
1295
+ "iid_masks",
1296
+ "p",
1297
+ "zero_masking",
1298
+ ]
1299
+
1300
+ def __init__(
1301
+ self,
1302
+ n_time_masks: int,
1303
+ time_mask_param: int,
1304
+ n_freq_masks: int,
1305
+ freq_mask_param: int,
1306
+ iid_masks: bool = True,
1307
+ p: float = 1.0,
1308
+ zero_masking: bool = False,
1309
+ ) -> None:
1310
+ super(SpecAugment, self).__init__()
1311
+ self.n_time_masks = n_time_masks
1312
+ self.time_mask_param = time_mask_param
1313
+ self.n_freq_masks = n_freq_masks
1314
+ self.freq_mask_param = freq_mask_param
1315
+ self.iid_masks = iid_masks
1316
+ self.p = p
1317
+ self.zero_masking = zero_masking
1318
+
1319
+ def forward(self, specgram: Tensor) -> Tensor:
1320
+ r"""
1321
+ Args:
1322
+ specgram (Tensor): Tensor of shape `(..., freq, time)`.
1323
+ Returns:
1324
+ Tensor: Masked spectrogram of shape `(..., freq, time)`.
1325
+ """
1326
+ if self.zero_masking:
1327
+ mask_value = 0.0
1328
+ else:
1329
+ mask_value = specgram.mean()
1330
+ time_dim = specgram.dim() - 1
1331
+ freq_dim = time_dim - 1
1332
+
1333
+ if specgram.dim() > 2 and self.iid_masks is True:
1334
+ for _ in range(self.n_time_masks):
1335
+ specgram = F.mask_along_axis_iid(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
1336
+ for _ in range(self.n_freq_masks):
1337
+ specgram = F.mask_along_axis_iid(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
1338
+ else:
1339
+ for _ in range(self.n_time_masks):
1340
+ specgram = F.mask_along_axis(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
1341
+ for _ in range(self.n_freq_masks):
1342
+ specgram = F.mask_along_axis(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
1343
+
1344
+ return specgram
1345
+
1346
+
1302
1347
  class Loudness(torch.nn.Module):
1303
1348
  r"""Measure audio loudness according to the ITU-R BS.1770-4 recommendation.
1304
1349
 
@@ -5,7 +5,7 @@ from pathlib import Path
5
5
  from typing import Union
6
6
 
7
7
  import torch
8
-
8
+ from torchaudio._internal import download_url_to_file
9
9
 
10
10
  _LG = logging.getLogger(__name__)
11
11
 
@@ -18,7 +18,7 @@ def _get_local_path(key):
18
18
 
19
19
  def _download(key, path, progress):
20
20
  url = f"https://download.pytorch.org/torchaudio/{key}"
21
- torch.hub.download_url_to_file(url, path, progress=progress)
21
+ download_url_to_file(url, path, progress=progress)
22
22
 
23
23
 
24
24
  def _get_hash(path, hash, chunk_size=1028):
@@ -4,7 +4,6 @@ It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`
4
4
  """
5
5
  from typing import Dict, List, Tuple
6
6
 
7
- import torch
8
7
  import torchaudio
9
8
 
10
9
 
@@ -16,7 +15,7 @@ def get_versions() -> Dict[str, Tuple[int]]:
16
15
  dict: mapping from library names to version string,
17
16
  i.e. `"libavutil": (56, 22, 100)`.
18
17
  """
19
- return torch.ops.torchaudio.ffmpeg_get_versions()
18
+ return torchaudio._extension._FFMPEG_EXT.get_versions()
20
19
 
21
20
 
22
21
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -25,7 +24,7 @@ def get_log_level() -> int:
25
24
 
26
25
  See :py:func:`set_log_level` for the details.
27
26
  """
28
- return torch.ops.torchaudio.ffmpeg_get_log_level()
27
+ return torchaudio._extension._FFMPEG_EXT.get_log_level()
29
28
 
30
29
 
31
30
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -62,7 +61,7 @@ def set_log_level(level: int):
62
61
  Extremely verbose debugging, useful for libav* development.
63
62
 
64
63
  """
65
- torch.ops.torchaudio.ffmpeg_set_log_level(level)
64
+ torchaudio._extension._FFMPEG_EXT.set_log_level(level)
66
65
 
67
66
 
68
67
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -80,7 +79,7 @@ def get_demuxers() -> Dict[str, str]:
80
79
  ... aax: CRI AAX
81
80
  ... ac3: raw AC-3
82
81
  """
83
- return torch.ops.torchaudio.ffmpeg_get_demuxers()
82
+ return torchaudio._extension._FFMPEG_EXT.get_demuxers()
84
83
 
85
84
 
86
85
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -99,7 +98,7 @@ def get_muxers() -> Dict[str, str]:
99
98
  ... adx: CRI ADX
100
99
  ... aiff: Audio IFF
101
100
  """
102
- return torch.ops.torchaudio.ffmpeg_get_muxers()
101
+ return torchaudio._extension._FFMPEG_EXT.get_muxers()
103
102
 
104
103
 
105
104
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -118,7 +117,7 @@ def get_audio_decoders() -> Dict[str, str]:
118
117
  ... adx: CRI ADX
119
118
  ... aiff: Audio IFF
120
119
  """
121
- return torch.ops.torchaudio.ffmpeg_get_audio_decoders()
120
+ return torchaudio._extension._FFMPEG_EXT.get_audio_decoders()
122
121
 
123
122
 
124
123
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -138,7 +137,7 @@ def get_audio_encoders() -> Dict[str, str]:
138
137
  ... ac3_fixed: ATSC A/52A (AC-3)
139
138
  ... alac: ALAC (Apple Lossless Audio Codec)
140
139
  """
141
- return torch.ops.torchaudio.ffmpeg_get_audio_encoders()
140
+ return torchaudio._extension._FFMPEG_EXT.get_audio_encoders()
142
141
 
143
142
 
144
143
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -158,7 +157,7 @@ def get_video_decoders() -> Dict[str, str]:
158
157
  ... amv: AMV Video
159
158
  ... anm: Deluxe Paint Animation
160
159
  """
161
- return torch.ops.torchaudio.ffmpeg_get_video_decoders()
160
+ return torchaudio._extension._FFMPEG_EXT.get_video_decoders()
162
161
 
163
162
 
164
163
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -179,7 +178,7 @@ def get_video_encoders() -> Dict[str, str]:
179
178
  ... asv1: ASUS V1
180
179
  ... asv2: ASUS V2
181
180
  """
182
- return torch.ops.torchaudio.ffmpeg_get_video_encoders()
181
+ return torchaudio._extension._FFMPEG_EXT.get_video_encoders()
183
182
 
184
183
 
185
184
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -195,7 +194,7 @@ def get_input_devices() -> Dict[str, str]:
195
194
  ... avfoundation: AVFoundation input device
196
195
  ... lavfi: Libavfilter virtual input device
197
196
  """
198
- return torch.ops.torchaudio.ffmpeg_get_input_devices()
197
+ return torchaudio._extension._FFMPEG_EXT.get_input_devices()
199
198
 
200
199
 
201
200
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -210,7 +209,7 @@ def get_output_devices() -> Dict[str, str]:
210
209
  >>> print(f"{k}: {v}")
211
210
  ... audiotoolbox: AudioToolbox output device
212
211
  """
213
- return torch.ops.torchaudio.ffmpeg_get_output_devices()
212
+ return torchaudio._extension._FFMPEG_EXT.get_output_devices()
214
213
 
215
214
 
216
215
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -224,7 +223,7 @@ def get_input_protocols() -> List[str]:
224
223
  >>> print(get_input_protocols())
225
224
  ... ['file', 'ftp', 'hls', 'http','https', 'pipe', 'rtmp', 'tcp', 'tls', 'udp', 'unix']
226
225
  """
227
- return torch.ops.torchaudio.ffmpeg_get_input_protocols()
226
+ return torchaudio._extension._FFMPEG_EXT.get_input_protocols()
228
227
 
229
228
 
230
229
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -238,7 +237,7 @@ def get_output_protocols() -> List[str]:
238
237
  >>> print(get_output_protocols())
239
238
  ... ['file', 'ftp', 'http', 'https', 'md5', 'pipe', 'prompeg', 'rtmp', 'tee', 'tcp', 'tls', 'udp', 'unix']
240
239
  """
241
- return torch.ops.torchaudio.ffmpeg_get_output_protocols()
240
+ return torchaudio._extension._FFMPEG_EXT.get_output_protocols()
242
241
 
243
242
 
244
243
  @torchaudio._extension.fail_if_no_ffmpeg
@@ -252,4 +251,10 @@ def get_build_config() -> str:
252
251
  >>> print(get_build_config())
253
252
  --prefix=/Users/runner/miniforge3 --cc=arm64-apple-darwin20.0.0-clang --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-neon --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-libvpx --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/pkg-config --enable-cross-compile --arch=arm64 --target-os=darwin --cross-prefix=arm64-apple-darwin20.0.0- --host-cc=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/x86_64-apple-darwin13.4.0-clang # noqa
254
253
  """
255
- return torch.ops.torchaudio.ffmpeg_get_build_config()
254
+ return torchaudio._extension._FFMPEG_EXT.get_build_config()
255
+
256
+
257
+ @torchaudio._extension.fail_if_no_ffmpeg
258
+ def clear_cuda_context_cache():
259
+ """Clear the CUDA context used by CUDA Hardware accelerated video decoding"""
260
+ torchaudio._extension._FFMPEG_EXT.clear_cuda_context_cache()
@@ -4,7 +4,6 @@
4
4
 
5
5
  from typing import Dict, List
6
6
 
7
- import torch
8
7
  import torchaudio
9
8
 
10
9
 
@@ -18,7 +17,7 @@ def set_seed(seed: int):
18
17
  See Also:
19
18
  http://sox.sourceforge.net/sox.html
20
19
  """
21
- torch.ops.torchaudio.sox_utils_set_seed(seed)
20
+ torchaudio.lib._torchaudio_sox.set_seed(seed)
22
21
 
23
22
 
24
23
  @torchaudio._extension.fail_if_no_sox
@@ -36,7 +35,7 @@ def set_verbosity(verbosity: int):
36
35
  See Also:
37
36
  http://sox.sourceforge.net/sox.html
38
37
  """
39
- torch.ops.torchaudio.sox_utils_set_verbosity(verbosity)
38
+ torchaudio.lib._torchaudio_sox.set_verbosity(verbosity)
40
39
 
41
40
 
42
41
  @torchaudio._extension.fail_if_no_sox
@@ -49,7 +48,7 @@ def set_buffer_size(buffer_size: int):
49
48
  See Also:
50
49
  http://sox.sourceforge.net/sox.html
51
50
  """
52
- torch.ops.torchaudio.sox_utils_set_buffer_size(buffer_size)
51
+ torchaudio.lib._torchaudio_sox.set_buffer_size(buffer_size)
53
52
 
54
53
 
55
54
  @torchaudio._extension.fail_if_no_sox
@@ -63,7 +62,7 @@ def set_use_threads(use_threads: bool):
63
62
  See Also:
64
63
  http://sox.sourceforge.net/sox.html
65
64
  """
66
- torch.ops.torchaudio.sox_utils_set_use_threads(use_threads)
65
+ torchaudio.lib._torchaudio_sox.set_use_threads(use_threads)
67
66
 
68
67
 
69
68
  @torchaudio._extension.fail_if_no_sox
@@ -73,7 +72,7 @@ def list_effects() -> Dict[str, str]:
73
72
  Returns:
74
73
  Dict[str, str]: Mapping from ``effect name`` to ``usage``
75
74
  """
76
- return dict(torch.ops.torchaudio.sox_utils_list_effects())
75
+ return dict(torchaudio.lib._torchaudio_sox.list_effects())
77
76
 
78
77
 
79
78
  @torchaudio._extension.fail_if_no_sox
@@ -83,7 +82,7 @@ def list_read_formats() -> List[str]:
83
82
  Returns:
84
83
  List[str]: List of supported audio formats
85
84
  """
86
- return torch.ops.torchaudio.sox_utils_list_read_formats()
85
+ return torchaudio.lib._torchaudio_sox.list_read_formats()
87
86
 
88
87
 
89
88
  @torchaudio._extension.fail_if_no_sox
@@ -93,7 +92,7 @@ def list_write_formats() -> List[str]:
93
92
  Returns:
94
93
  List[str]: List of supported audio formats
95
94
  """
96
- return torch.ops.torchaudio.sox_utils_list_write_formats()
95
+ return torchaudio.lib._torchaudio_sox.list_write_formats()
97
96
 
98
97
 
99
98
  @torchaudio._extension.fail_if_no_sox
@@ -103,4 +102,4 @@ def get_buffer_size() -> int:
103
102
  Returns:
104
103
  int: size in bytes of buffers used for processing audio.
105
104
  """
106
- return torch.ops.torchaudio.sox_utils_get_buffer_size()
105
+ return torchaudio.lib._torchaudio_sox.get_buffer_size()
torchaudio/version.py CHANGED
@@ -1,2 +1,2 @@
1
- __version__ = '2.0.2'
2
- git_version = '701239f864183e6490c62e0c54343ff9d921e20f'
1
+ __version__ = '2.1.1'
2
+ git_version = '5784206b90d738de888dce4c99b8b46be213f019'
@@ -0,0 +1,113 @@
1
+ Metadata-Version: 2.1
2
+ Name: torchaudio
3
+ Version: 2.1.1
4
+ Summary: An audio package for PyTorch
5
+ Home-page: https://github.com/pytorch/audio
6
+ Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
7
+ Author-email: soumith@pytorch.org
8
+ Maintainer: Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
9
+ Maintainer-email: moto@meta.com
10
+ Classifier: Environment :: Plugins
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: BSD License
14
+ Classifier: Operating System :: MacOS :: MacOS X
15
+ Classifier: Operating System :: Microsoft :: Windows
16
+ Classifier: Operating System :: POSIX
17
+ Classifier: Programming Language :: C++
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: Implementation :: CPython
23
+ Classifier: Topic :: Multimedia :: Sound/Audio
24
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: torch (==2.1.1)
28
+
29
+ torchaudio: an audio library for PyTorch
30
+ ========================================
31
+
32
+ [![Documentation](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchaudio%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/audio/main/)
33
+ [![Anaconda Badge](https://anaconda.org/pytorch/torchaudio/badges/downloads.svg)](https://anaconda.org/pytorch/torchaudio)
34
+ [![Anaconda-Server Badge](https://anaconda.org/pytorch/torchaudio/badges/platforms.svg)](https://anaconda.org/pytorch/torchaudio)
35
+
36
+ ![TorchAudio Logo](docs/source/_static/img/logo.png)
37
+
38
+ The aim of torchaudio is to apply [PyTorch](https://github.com/pytorch/pytorch) to
39
+ the audio domain. By supporting PyTorch, torchaudio follows the same philosophy
40
+ of providing strong GPU acceleration, having a focus on trainable features through
41
+ the autograd system, and having consistent style (tensor names and dimension names).
42
+ Therefore, it is primarily a machine learning library and not a general signal
43
+ processing library. The benefits of PyTorch can be seen in torchaudio through
44
+ having all the computations be through PyTorch operations which makes it easy
45
+ to use and feel like a natural extension.
46
+
47
+ - [Support audio I/O (Load files, Save files)](http://pytorch.org/audio/main/)
48
+ - Load a variety of audio formats, such as `wav`, `mp3`, `ogg`, `flac`, `opus`, `sphere`, into a torch Tensor using SoX
49
+ - [Kaldi (ark/scp)](http://pytorch.org/audio/main/kaldi_io.html)
50
+ - [Dataloaders for common audio datasets](http://pytorch.org/audio/main/datasets.html)
51
+ - Audio and speech processing functions
52
+ - [forced_align](https://pytorch.org/audio/main/generated/torchaudio.functional.forced_align.html)
53
+ - Common audio transforms
54
+ - [Spectrogram, AmplitudeToDB, MelScale, MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding, Resample](http://pytorch.org/audio/main/transforms.html)
55
+ - Compliance interfaces: Run code using PyTorch that align with other libraries
56
+ - [Kaldi: spectrogram, fbank, mfcc](https://pytorch.org/audio/main/compliance.kaldi.html)
57
+
58
+ Installation
59
+ ------------
60
+
61
+ Please refer to https://pytorch.org/audio/main/installation.html for installation and build process of TorchAudio.
62
+
63
+
64
+ API Reference
65
+ -------------
66
+
67
+ API Reference is located here: http://pytorch.org/audio/main/
68
+
69
+ Contributing Guidelines
70
+ -----------------------
71
+
72
+ Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md)
73
+
74
+ Citation
75
+ --------
76
+
77
+ If you find this package useful, please cite as:
78
+
79
+ ```bibtex
80
+ @article{yang2021torchaudio,
81
+ title={TorchAudio: Building Blocks for Audio and Speech Processing},
82
+ author={Yao-Yuan Yang and Moto Hira and Zhaoheng Ni and Anjali Chourdia and Artyom Astafurov and Caroline Chen and Ching-Feng Yeh and Christian Puhrsch and David Pollack and Dmitriy Genzel and Donny Greenberg and Edward Z. Yang and Jason Lian and Jay Mahadeokar and Jeff Hwang and Ji Chen and Peter Goldsborough and Prabhat Roy and Sean Narenthiran and Shinji Watanabe and Soumith Chintala and Vincent Quenneville-Bélair and Yangyang Shi},
83
+ journal={arXiv preprint arXiv:2110.15018},
84
+ year={2021}
85
+ }
86
+ ```
87
+
88
+ ```bibtex
89
+ @misc{hwang2023torchaudio,
90
+ title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
91
+ author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
92
+ year={2023},
93
+ eprint={2310.17864},
94
+ archivePrefix={arXiv},
95
+ primaryClass={eess.AS}
96
+ }
97
+ ```
98
+
99
+ Disclaimer on Datasets
100
+ ----------------------
101
+
102
+ This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license.
103
+
104
+ If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community!
105
+
106
+ Pre-trained Model License
107
+ -------------------------
108
+
109
+ The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case.
110
+
111
+ For instance, SquimSubjective model is released under the Creative Commons Attribution Non Commercial 4.0 International (CC-BY-NC 4.0) license. See [the link](https://zenodo.org/record/4660670#.ZBtWPOxuerN) for additional details.
112
+
113
+ Other pre-trained models that have different licenses are noted in the documentation. Please check out the [documentation page](https://pytorch.org/audio/main/).