torchaudio 2.7.1__cp313-cp313t-win_amd64.whl → 2.9.0__cp313-cp313t-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. Click here for more details.

Files changed (92)
  1. torchaudio/__init__.py +184 -33
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +68 -10
  5. torchaudio/_torchcodec.py +340 -0
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +6 -3
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +31 -61
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +19 -1
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +3 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +4 -1
  23. torchaudio/transforms/_transforms.py +4 -3
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -1
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/METADATA +15 -7
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -317
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -13
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -144
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -67
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -433
  54. torchaudio/prototype/functional/_rir.py +0 -379
  55. torchaudio/prototype/functional/functional.py +0 -190
  56. torchaudio/prototype/models/__init__.py +0 -36
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -794
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -333
  59. torchaudio/prototype/models/conv_emformer.py +0 -525
  60. torchaudio/prototype/models/hifi_gan.py +0 -336
  61. torchaudio/prototype/models/rnnt.py +0 -711
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -399
  63. torchaudio/prototype/pipelines/__init__.py +0 -12
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -3
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -233
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -82
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -228
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -456
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -272
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -99
  75. torchaudio-2.7.1.dist-info/RECORD +0 -144
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -978
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -247
  91. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/LICENSE +0 -0
  92. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
@@ -1,7 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
 
3
3
  import math
4
- import tempfile
5
4
  import warnings
6
5
  from collections.abc import Sequence
7
6
  from typing import List, Optional, Tuple, Union
@@ -9,7 +8,7 @@ from typing import List, Optional, Tuple, Union
9
8
  import torch
10
9
  import torchaudio
11
10
  from torch import Tensor
12
- from torchaudio._internal.module_utils import deprecated
11
+ from torchaudio._internal.module_utils import dropping_support
13
12
 
14
13
  from .filtering import highpass_biquad, treble_biquad
15
14
 
@@ -33,7 +32,6 @@ __all__ = [
33
32
  "mask_along_axis_iid",
34
33
  "sliding_window_cmn",
35
34
  "spectral_centroid",
36
- "apply_codec",
37
35
  "resample",
38
36
  "edit_distance",
39
37
  "loudness",
@@ -816,7 +814,7 @@ def _get_mask_param(mask_param: int, p: float, axis_length: int) -> int:
816
814
  def mask_along_axis_iid(
817
815
  specgrams: Tensor,
818
816
  mask_param: int,
819
- mask_value: float,
817
+ mask_value: Union[float, Tensor],
820
818
  axis: int,
821
819
  p: float = 1.0,
822
820
  ) -> Tensor:
@@ -873,7 +871,12 @@ def mask_along_axis_iid(
873
871
 
874
872
  # Per batch example masking
875
873
  specgrams = specgrams.transpose(axis, -1)
876
- specgrams = specgrams.masked_fill((mask >= mask_start) & (mask < mask_end), mask_value)
874
+ # this aims to avoid CPU-GPU sync from upstream
875
+ specgrams = (
876
+ torch.where((mask >= mask_start) & (mask < mask_end), mask_value.repeat(specgrams.shape), specgrams)
877
+ if isinstance(mask_value, Tensor)
878
+ else specgrams.masked_fill((mask >= mask_start) & (mask < mask_end), mask_value)
879
+ )
877
880
  specgrams = specgrams.transpose(axis, -1)
878
881
 
879
882
  return specgrams
@@ -1295,51 +1298,6 @@ def spectral_centroid(
1295
1298
  return (freqs * specgram).sum(dim=freq_dim) / specgram.sum(dim=freq_dim)
1296
1299
 
1297
1300
 
1298
- @deprecated("Please migrate to :py:class:`torchaudio.io.AudioEffector`.", remove=False)
1299
- def apply_codec(
1300
- waveform: Tensor,
1301
- sample_rate: int,
1302
- format: str,
1303
- channels_first: bool = True,
1304
- compression: Optional[float] = None,
1305
- encoding: Optional[str] = None,
1306
- bits_per_sample: Optional[int] = None,
1307
- ) -> Tensor:
1308
- r"""
1309
- Apply codecs as a form of augmentation.
1310
-
1311
- .. devices:: CPU
1312
-
1313
- Args:
1314
- waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```.
1315
- sample_rate (int): Sample rate of the audio waveform.
1316
- format (str): File format.
1317
- channels_first (bool, optional):
1318
- When True, both the input and output Tensor have dimension `(channel, time)`.
1319
- Otherwise, they have dimension `(time, channel)`.
1320
- compression (float or None, optional): Used for formats other than WAV.
1321
- For more details see :py:func:`torchaudio.backend.sox_io_backend.save`.
1322
- encoding (str or None, optional): Changes the encoding for the supported formats.
1323
- For more details see :py:func:`torchaudio.backend.sox_io_backend.save`.
1324
- bits_per_sample (int or None, optional): Changes the bit depth for the supported formats.
1325
- For more details see :py:func:`torchaudio.backend.sox_io_backend.save`.
1326
-
1327
- Returns:
1328
- Tensor: Resulting Tensor.
1329
- If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`.
1330
- """
1331
- from torchaudio.backend import _sox_io_backend
1332
-
1333
- with tempfile.NamedTemporaryFile() as f:
1334
- torchaudio.backend._sox_io_backend.save(
1335
- f.name, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample
1336
- )
1337
- augmented, sr = _sox_io_backend.load(f.name, channels_first=channels_first, format=format)
1338
- if sr != sample_rate:
1339
- augmented = resample(augmented, sr, sample_rate)
1340
- return augmented
1341
-
1342
-
1343
1301
  _CPU = torch.device("cpu")
1344
1302
 
1345
1303
 
@@ -1760,7 +1718,22 @@ def _fix_waveform_shape(
1760
1718
  return waveform_shift
1761
1719
 
1762
1720
 
1763
- def rnnt_loss(
1721
+ class RnntLoss(torch.autograd.Function):
1722
+ @staticmethod
1723
+ def forward(ctx, *args):
1724
+ output, saved = torch.ops.torchaudio.rnnt_loss_forward(*args)
1725
+ ctx.save_for_backward(saved)
1726
+ return output
1727
+
1728
+ @staticmethod
1729
+ def backward(ctx, dy):
1730
+ grad = ctx.saved_tensors[0]
1731
+ grad_out = dy.view((-1, 1, 1, 1))
1732
+ result = grad * grad_out
1733
+ return (result, None, None, None, None, None, None, None)
1734
+
1735
+
1736
+ def _rnnt_loss(
1764
1737
  logits: Tensor,
1765
1738
  targets: Tensor,
1766
1739
  logit_lengths: Tensor,
@@ -1802,15 +1775,7 @@ def rnnt_loss(
1802
1775
  if blank < 0: # reinterpret blank index if blank < 0.
1803
1776
  blank = logits.shape[-1] + blank
1804
1777
 
1805
- costs, _ = torch.ops.torchaudio.rnnt_loss(
1806
- logits=logits,
1807
- targets=targets,
1808
- logit_lengths=logit_lengths,
1809
- target_lengths=target_lengths,
1810
- blank=blank,
1811
- clamp=clamp,
1812
- fused_log_softmax=fused_log_softmax,
1813
- )
1778
+ costs = RnntLoss.apply(logits, targets, logit_lengths, target_lengths, blank, clamp, fused_log_softmax)
1814
1779
 
1815
1780
  if reduction == "mean":
1816
1781
  return costs.mean()
@@ -1865,6 +1830,11 @@ def psd(
1865
1830
  return psd
1866
1831
 
1867
1832
 
1833
+ # Expose both deprecated wrapper as well as original because torchscript breaks on
1834
+ # wrapped functions.
1835
+ rnnt_loss = dropping_support(_rnnt_loss)
1836
+
1837
+
1868
1838
  def _compute_mat_trace(input: torch.Tensor, dim1: int = -1, dim2: int = -2) -> torch.Tensor:
1869
1839
  r"""Compute the trace of a Tensor along ``dim1`` and ``dim2`` dimensions.
1870
1840
 
@@ -2494,7 +2464,7 @@ def deemphasis(waveform, coeff: float = 0.97) -> torch.Tensor:
2494
2464
  """
2495
2465
  a_coeffs = torch.tensor([1.0, -coeff], dtype=waveform.dtype, device=waveform.device)
2496
2466
  b_coeffs = torch.tensor([1.0, 0.0], dtype=waveform.dtype, device=waveform.device)
2497
- return torchaudio.functional.lfilter(waveform, a_coeffs=a_coeffs, b_coeffs=b_coeffs)
2467
+ return torchaudio.functional.filtering.lfilter(waveform, a_coeffs=a_coeffs, b_coeffs=b_coeffs)
2498
2468
 
2499
2469
 
2500
2470
  def frechet_distance(mu_x, sigma_x, mu_y, sigma_y):
Binary file
Binary file
@@ -1,3 +1,7 @@
1
+ import inspect
2
+
3
+ from torchaudio._internal.module_utils import dropping_class_support, dropping_support
4
+
1
5
  _CTC_DECODERS = [
2
6
  "CTCHypothesis",
3
7
  "CTCDecoder",
@@ -33,7 +37,21 @@ def __getattr__(name: str):
33
37
  "To use CUCTC decoder, please set BUILD_CUDA_CTC_DECODER=1 when building from source."
34
38
  ) from err
35
39
 
36
- item = getattr(_cuda_ctc_decoder, name)
40
+ # TODO: when all unsupported classes are removed, replace the
41
+ # following if-else block with
42
+ # item = getattr(_cuda_ctc_decoder, name)
43
+ orig_item = getattr(_cuda_ctc_decoder, name)
44
+ if inspect.isclass(orig_item) or (
45
+ # workaround a failure to detect type instances
46
+ # after sphinx autodoc mocking, required for
47
+ # building docs
48
+ getattr(orig_item, "__sphinx_mock__", False)
49
+ and inspect.isclass(orig_item.__class__)
50
+ ):
51
+ item = dropping_class_support(orig_item)
52
+ else:
53
+ item = dropping_support(orig_item)
54
+
37
55
  globals()[name] = item
38
56
  return item
39
57
  raise AttributeError(f"module {__name__} has no attribute {name}")
@@ -25,7 +25,7 @@ from flashlight.lib.text.dictionary import (
25
25
  Dictionary as _Dictionary,
26
26
  load_words as _load_words,
27
27
  )
28
- from torchaudio.utils import download_asset
28
+ from torchaudio.utils import _download_asset
29
29
 
30
30
  try:
31
31
  from flashlight.lib.text.decoder.kenlm import KenLM as _KenLM
@@ -69,7 +69,7 @@ def _get_word_dict(lexicon, lm, lm_dict, tokens_dict, unk_word):
69
69
 
70
70
  if lexicon and word_dict is None:
71
71
  word_dict = _create_word_dict(lexicon)
72
- elif not lexicon and word_dict is None and type(lm) == str:
72
+ elif not lexicon and word_dict is None and type(lm) is str:
73
73
  d = {tokens_dict.get_entry(i): [[tokens_dict.get_entry(i)]] for i in range(tokens_dict.index_size())}
74
74
  d[unk_word] = [[unk_word]]
75
75
  word_dict = _create_word_dict(d)
@@ -499,7 +499,7 @@ def ctc_decoder(
499
499
  # construct word dict and language model
500
500
  word_dict = _get_word_dict(lexicon, lm, lm_dict, tokens_dict, unk_word)
501
501
 
502
- if type(lm) == str:
502
+ if type(lm) is str:
503
503
  if _KenLM is None:
504
504
  raise RuntimeError(
505
505
  "flashlight-text is installed, but KenLM is not installed. "
@@ -554,10 +554,10 @@ def download_pretrained_files(model: str) -> _PretrainedFiles:
554
554
  """
555
555
 
556
556
  files = _get_filenames(model)
557
- lexicon_file = download_asset(files.lexicon)
558
- tokens_file = download_asset(files.tokens)
557
+ lexicon_file = _download_asset(files.lexicon)
558
+ tokens_file = _download_asset(files.tokens)
559
559
  if files.lm is not None:
560
- lm_file = download_asset(files.lm)
560
+ lm_file = _download_asset(files.lm)
561
561
  else:
562
562
  lm_file = None
563
563
 
@@ -181,7 +181,7 @@ def cuda_ctc_decoder(
181
181
  >>> )
182
182
  >>> results = decoder(log_probs, encoder_out_lens) # List of shape (B, nbest) of Hypotheses
183
183
  """
184
- if type(tokens) == str:
184
+ if type(tokens) is str:
185
185
  tokens = _get_vocab_list(tokens)
186
186
 
187
187
  return CUCTCDecoder(vocab_list=tokens, beam_size=beam_size, nbest=nbest, blank_skip_threshold=blank_skip_threshold)
@@ -285,7 +285,7 @@ def squim_objective_model(
285
285
  chunk_size: int,
286
286
  chunk_stride: Optional[int] = None,
287
287
  ) -> SquimObjective:
288
- """Build a custome :class:`torchaudio.prototype.models.SquimObjective` model.
288
+ """Build a custome :class:`torchaudio.models.squim.SquimObjective` model.
289
289
 
290
290
  Args:
291
291
  feat_dim (int, optional): The feature dimension after Encoder module.
@@ -313,7 +313,7 @@ def squim_objective_model(
313
313
 
314
314
 
315
315
  def squim_objective_base() -> SquimObjective:
316
- """Build :class:`torchaudio.prototype.models.SquimObjective` model with default arguments."""
316
+ """Build :class:`torchaudio.models.squim.SquimObjective` model with default arguments."""
317
317
  return squim_objective_model(
318
318
  feat_dim=256,
319
319
  win_len=64,
@@ -52,7 +52,7 @@ class SourceSeparationBundle:
52
52
  def get_model(self) -> torch.nn.Module:
53
53
  """Construct the model and load the pretrained weight."""
54
54
  model = self._model_factory_func()
55
- path = torchaudio.utils.download_asset(self._model_path)
55
+ path = torchaudio.utils._download_asset(self._model_path)
56
56
  state_dict = torch.load(path)
57
57
  model.load_state_dict(state_dict)
58
58
  model.eval()
@@ -50,7 +50,7 @@ class SquimObjectiveBundle:
50
50
  Variation of :py:class:`~torchaudio.models.SquimObjective`.
51
51
  """
52
52
  model = squim_objective_base()
53
- path = torchaudio.utils.download_asset(f"models/{self._path}")
53
+ path = torchaudio.utils._download_asset(f"models/{self._path}")
54
54
  state_dict = torch.load(path, weights_only=True)
55
55
  model.load_state_dict(state_dict)
56
56
  model.eval()
@@ -125,7 +125,7 @@ class SquimSubjectiveBundle:
125
125
  Variation of :py:class:`~torchaudio.models.SquimObjective`.
126
126
  """
127
127
  model = squim_subjective_base()
128
- path = torchaudio.utils.download_asset(f"models/{self._path}")
128
+ path = torchaudio.utils._download_asset(f"models/{self._path}")
129
129
  state_dict = torch.load(path, weights_only=True)
130
130
  model.load_state_dict(state_dict)
131
131
  model.eval()
@@ -161,6 +161,7 @@ def _load_phonemizer(file, dl_kwargs):
161
161
  raise RuntimeError("DeepPhonemizer is not installed. Please install it.")
162
162
 
163
163
  from dp.phonemizer import Phonemizer
164
+ from dp.preprocessing.text import LanguageTokenizer, Preprocessor, SequenceTokenizer
164
165
 
165
166
  # By default, dp issues DEBUG level log.
166
167
  logger = logging.getLogger("dp")
@@ -174,7 +175,8 @@ def _load_phonemizer(file, dl_kwargs):
174
175
  if not os.path.exists(path):
175
176
  dl_kwargs = {} if dl_kwargs is None else dl_kwargs
176
177
  download_url_to_file(url, path, **dl_kwargs)
177
- return Phonemizer.from_checkpoint(path)
178
+ with torch.serialization.safe_globals([Preprocessor, LanguageTokenizer, SequenceTokenizer]):
179
+ return Phonemizer.from_checkpoint(path)
178
180
  finally:
179
181
  logger.setLevel(orig_level)
180
182
 
@@ -244,7 +244,7 @@ class RNNTBundle:
244
244
 
245
245
  def _get_model(self) -> RNNT:
246
246
  model = self._rnnt_factory_func()
247
- path = torchaudio.utils.download_asset(self._rnnt_path)
247
+ path = torchaudio.utils._download_asset(self._rnnt_path)
248
248
  state_dict = torch.load(path)
249
249
  model.load_state_dict(state_dict)
250
250
  model.eval()
@@ -313,7 +313,7 @@ class RNNTBundle:
313
313
  Returns:
314
314
  FeatureExtractor
315
315
  """
316
- local_path = torchaudio.utils.download_asset(self._global_stats_path)
316
+ local_path = torchaudio.utils._download_asset(self._global_stats_path)
317
317
  return _ModuleFeatureExtractor(
318
318
  torch.nn.Sequential(
319
319
  torchaudio.transforms.MelSpectrogram(
@@ -332,7 +332,7 @@ class RNNTBundle:
332
332
  Returns:
333
333
  FeatureExtractor
334
334
  """
335
- local_path = torchaudio.utils.download_asset(self._global_stats_path)
335
+ local_path = torchaudio.utils._download_asset(self._global_stats_path)
336
336
  return _ModuleFeatureExtractor(
337
337
  torch.nn.Sequential(
338
338
  torchaudio.transforms.MelSpectrogram(
@@ -350,7 +350,7 @@ class RNNTBundle:
350
350
  Returns:
351
351
  TokenProcessor
352
352
  """
353
- local_path = torchaudio.utils.download_asset(self._sp_model_path)
353
+ local_path = torchaudio.utils._download_asset(self._sp_model_path)
354
354
  return _SentencePieceTokenProcessor(local_path)
355
355
 
356
356
 
@@ -1,3 +1,5 @@
1
+ from torchaudio._internal.module_utils import dropping_class_support
2
+
1
3
  from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR
2
4
  from ._transforms import (
3
5
  AddNoise,
@@ -21,7 +23,7 @@ from ._transforms import (
21
23
  PitchShift,
22
24
  Preemphasis,
23
25
  Resample,
24
- RNNTLoss,
26
+ RNNTLoss as _RNNTLoss,
25
27
  SlidingWindowCmn,
26
28
  SpecAugment,
27
29
  SpectralCentroid,
@@ -34,6 +36,7 @@ from ._transforms import (
34
36
  Vol,
35
37
  )
36
38
 
39
+ RNNTLoss = dropping_class_support(_RNNTLoss)
37
40
 
38
41
  __all__ = [
39
42
  "AddNoise",
@@ -15,6 +15,7 @@ from torchaudio.functional.functional import (
15
15
  _check_convolve_mode,
16
16
  _fix_waveform_shape,
17
17
  _get_sinc_resample_kernel,
18
+ _rnnt_loss,
18
19
  _stretch_waveform,
19
20
  )
20
21
 
@@ -1184,7 +1185,7 @@ class _AxisMasking(torch.nn.Module):
1184
1185
  self.iid_masks = iid_masks
1185
1186
  self.p = p
1186
1187
 
1187
- def forward(self, specgram: Tensor, mask_value: float = 0.0) -> Tensor:
1188
+ def forward(self, specgram: Tensor, mask_value: Union[float, torch.Tensor] = 0.0) -> Tensor:
1188
1189
  r"""
1189
1190
  Args:
1190
1191
  specgram (Tensor): Tensor of dimension `(..., freq, time)`.
@@ -1846,7 +1847,7 @@ class RNNTLoss(torch.nn.Module):
1846
1847
  Tensor: Loss with the reduction option applied. If ``reduction`` is ``"none"``, then size (batch),
1847
1848
  otherwise scalar.
1848
1849
  """
1849
- return F.rnnt_loss(
1850
+ return _rnnt_loss(
1850
1851
  logits,
1851
1852
  targets,
1852
1853
  logit_lengths,
@@ -2134,4 +2135,4 @@ class Deemphasis(torch.nn.Module):
2134
2135
  Returns:
2135
2136
  torch.Tensor: De-emphasized waveform, with shape `(..., N)`.
2136
2137
  """
2137
- return F.deemphasis(waveform, coeff=self.coeff)
2138
+ return F.functional.deemphasis(waveform, coeff=self.coeff)
@@ -1,11 +1,4 @@
1
- from torio.utils import ffmpeg_utils
1
+ from .download import _download_asset
2
2
 
3
- from . import sox_utils
4
- from .download import download_asset
5
3
 
6
-
7
- __all__ = [
8
- "download_asset",
9
- "sox_utils",
10
- "ffmpeg_utils",
11
- ]
4
+ __all__ = ["_download_asset"]
@@ -31,7 +31,7 @@ def _get_hash(path, hash, chunk_size=1028):
31
31
  return m.hexdigest()
32
32
 
33
33
 
34
- def download_asset(
34
+ def _download_asset(
35
35
  key: str,
36
36
  hash: str = "",
37
37
  path: Union[str, PathLike] = "",
torchaudio/version.py CHANGED
@@ -1,2 +1,2 @@
1
- __version__ = '2.7.1+cpu'
2
- git_version = '95c61b4168fc5133be8dd8c1337d929d066ae6cf'
1
+ __version__ = '2.9.0+cpu'
2
+ git_version = 'eaa9e4e4dd413dca1084116581dc84fad403db3b'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: torchaudio
3
- Version: 2.7.1
3
+ Version: 2.9.0
4
4
  Summary: An audio package for PyTorch
5
5
  Home-page: https://github.com/pytorch/audio
6
6
  Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
@@ -15,17 +15,17 @@ Classifier: Operating System :: MacOS :: MacOS X
15
15
  Classifier: Operating System :: Microsoft :: Windows
16
16
  Classifier: Operating System :: POSIX
17
17
  Classifier: Programming Language :: C++
18
- Classifier: Programming Language :: Python :: 3.9
19
18
  Classifier: Programming Language :: Python :: 3.10
20
19
  Classifier: Programming Language :: Python :: 3.11
21
20
  Classifier: Programming Language :: Python :: 3.12
22
21
  Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
23
  Classifier: Programming Language :: Python :: Implementation :: CPython
24
24
  Classifier: Topic :: Multimedia :: Sound/Audio
25
25
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
- Requires-Dist: torch ==2.7.1
28
+ Requires-Dist: torch ==2.9.0
29
29
 
30
30
  torchaudio: an audio library for PyTorch
31
31
  ========================================
@@ -36,6 +36,17 @@ torchaudio: an audio library for PyTorch
36
36
 
37
37
  ![TorchAudio Logo](docs/source/_static/img/logo.png)
38
38
 
39
+ > [!NOTE]
40
+ > **We have transitioned TorchAudio into a
41
+ > maintenance phase. This process removed some user-facing
42
+ > features. These features were deprecated from TorchAudio 2.8 and removed in 2.9.
43
+ > Our main goals were to reduce redundancies with the rest of the
44
+ > PyTorch ecosystem, make it easier to maintain, and create a version of
45
+ > TorchAudio that is more tightly scoped to its strengths: processing audio
46
+ > data for ML. Please see
47
+ > [our community message](https://github.com/pytorch/audio/issues/3902)
48
+ > for more details.**
49
+
39
50
  The aim of torchaudio is to apply [PyTorch](https://github.com/pytorch/pytorch) to
40
51
  the audio domain. By supporting PyTorch, torchaudio follows the same philosophy
41
52
  of providing strong GPU acceleration, having a focus on trainable features through
@@ -45,9 +56,6 @@ processing library. The benefits of PyTorch can be seen in torchaudio through
45
56
  having all the computations be through PyTorch operations which makes it easy
46
57
  to use and feel like a natural extension.
47
58
 
48
- - [Support audio I/O (Load files, Save files)](http://pytorch.org/audio/main/)
49
- - Load a variety of audio formats, such as `wav`, `mp3`, `ogg`, `flac`, `opus`, `sphere`, into a torch Tensor using SoX
50
- - [Kaldi (ark/scp)](http://pytorch.org/audio/main/kaldi_io.html)
51
59
  - [Dataloaders for common audio datasets](http://pytorch.org/audio/main/datasets.html)
52
60
  - Audio and speech processing functions
53
61
  - [forced_align](https://pytorch.org/audio/main/generated/torchaudio.functional.forced_align.html)
@@ -88,7 +96,7 @@ If you find this package useful, please cite as:
88
96
 
89
97
  ```bibtex
90
98
  @misc{hwang2023torchaudio,
91
- title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
99
+ title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
92
100
  author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
93
101
  year={2023},
94
102
  eprint={2310.17864},
@@ -0,0 +1,85 @@
1
+ torchaudio/__init__.py,sha256=-yWZZVblWA06HQ4cUS8sRsaSZugqacqw0A7vQELxjYE,8082
2
+ torchaudio/_torchcodec.py,sha256=2saifA0BdhE12Zb51vgS6zrwsE1ir7mmvmJ1lFOWtLI,13764
3
+ torchaudio/version.py,sha256=UZrgLp-AqA1uCSaLgWb8hq-wCqJ8Pz6Pe5opbKJNYKQ,85
4
+ torchaudio/_extension/__init__.py,sha256=j7wdZTgwGv6PcQgS1kMisbDA-M4emX3gheOSmjq_jWs,1966
5
+ torchaudio/_extension/utils.py,sha256=kQ_PyLToNuPjLKOQa_-tT1LpFowcGQ0lpcuzrRPrmb8,5059
6
+ torchaudio/_internal/__init__.py,sha256=80cpJfTS8977YYrU3q5p4DRAGAkqEJrmG9Lq2hEDpoo,251
7
+ torchaudio/_internal/module_utils.py,sha256=sXO16_5rS9c67LlADALR16k3HcZo9dHyZ-y_L0zFnnY,5400
8
+ torchaudio/compliance/__init__.py,sha256=JNH_-dTQVmm55YwcVMuVvUYFWdXhGn4C__9S8IUsNoU,53
9
+ torchaudio/compliance/kaldi.py,sha256=bS7qJgS3k8FK1RkMiNEoP3q0xhjeV_V4RHQ9jo_rqOM,37479
10
+ torchaudio/datasets/__init__.py,sha256=hdHldm3OzoQLbI0kHj8tLxqwDhzMfedq0_t1kAK7ORg,1218
11
+ torchaudio/datasets/cmuarctic.py,sha256=c7c75817_brmb7cvFO6_Bj249cJDph9LDBOqs8aUyhM,7238
12
+ torchaudio/datasets/cmudict.py,sha256=_9vTz7_8BFVrcHeA61_-h2XLOl6IsdWCptkMWziOW7U,6176
13
+ torchaudio/datasets/commonvoice.py,sha256=OcFn-nG4YfBIz0YIpH91xH9rFka8yFJmrxy4vFZkC4I,2849
14
+ torchaudio/datasets/dr_vctk.py,sha256=Ayf85prDNr1LcWQ4bysVWdRVPry2JALjv6Mtq-6iBpY,4498
15
+ torchaudio/datasets/fluentcommands.py,sha256=KnmH1Y28k5PhqQX6eV-75MqwTRxiHSUUcvAsa-K954s,3353
16
+ torchaudio/datasets/gtzan.py,sha256=kt25Ly9qDGuiiVXgsXhS05tGi6laRhRko81-BQ4sZ-w,25475
17
+ torchaudio/datasets/iemocap.py,sha256=ZMMG_FpcWcMHEbhuRYRQaUWi_DoegjxCrnVyCg5EEVE,5077
18
+ torchaudio/datasets/librilight_limited.py,sha256=iwZBlSKVLrXzhZvaqjuVRGO6czxX4fpdzd8wWe5feWQ,4290
19
+ torchaudio/datasets/librimix.py,sha256=AncE671AOl04dRPsajNZW-ZxxI_PwA2sjBftdBg4Q-k,5249
20
+ torchaudio/datasets/librispeech.py,sha256=ys769I0UzG07UEmyZ_KDwATh4yc08hFUuCayK8tYIGg,6482
21
+ torchaudio/datasets/librispeech_biasing.py,sha256=KEGplRU_wpgb0VqrT-t42kvtC7lg4uMssZcosVvvPhg,7147
22
+ torchaudio/datasets/libritts.py,sha256=91Ep2Mq3OySre25GniXBLmRzTwEPiKmMaqXnzirn0xY,6038
23
+ torchaudio/datasets/ljspeech.py,sha256=l09BSBQH76I-LhYkIRF0u18tTi-4yysaF4gj2GSZaxw,3601
24
+ torchaudio/datasets/musdb_hq.py,sha256=FVlKsGEBHiT50y9GLswnt2QFph2PjiI6yCy1MxiG6f8,5214
25
+ torchaudio/datasets/quesst14.py,sha256=3y6H3T3g78jkDqca8jORQBOViZhH1RhlsfuY8HJ2OcU,4591
26
+ torchaudio/datasets/snips.py,sha256=mwVc5KsbMlPQJ87eyYgjnQ5S4EFXoQvm13dO0rXpJuE,5165
27
+ torchaudio/datasets/speechcommands.py,sha256=_wmrKSiEe0COO7uk0JVXypBmNxu0urnceHuFQ6zMOk0,7664
28
+ torchaudio/datasets/tedlium.py,sha256=UQZUaeUqmFntZWcH9HXOpGeW6tsCcG81bPjX2_CWxbg,8916
29
+ torchaudio/datasets/utils.py,sha256=mpg4t0hFitRGj9Ow7MXwCFNKGTnVsErVLpxfsbP7FE8,1757
30
+ torchaudio/datasets/vctk.py,sha256=vN_VzxTLyHW11I_rzfzMVA3h5JW917FaU3NCnR-zcL0,5842
31
+ torchaudio/datasets/voxceleb1.py,sha256=JlYkbyYOAFUFhGLULe3lgucANWf_G7qGqw47YjiX2IM,12034
32
+ torchaudio/datasets/yesno.py,sha256=B3hRNUazvB8V8SwOUlQzliB9vI9gMkl9SEl-dZ4PEaw,3115
33
+ torchaudio/functional/__init__.py,sha256=do2OUOUhg_8Z7TPUQ1HHpoWjNAPrwgxDIemk718TWO0,2581
34
+ torchaudio/functional/_alignment.py,sha256=P2ehTZ7IwuMFWVNqrhYjc1imBKNykwC03D7uvbgxBCA,4867
35
+ torchaudio/functional/filtering.py,sha256=piUbVknBOBdILrd1M9bzk2A9UCCn4qzKXNEgv7IYD7Q,64010
36
+ torchaudio/functional/functional.py,sha256=c-jSGnLx54qnQk5efZiLrioi5x1-2LIQF3P2lvLPPPo,97236
37
+ torchaudio/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
+ torchaudio/lib/_torchaudio.pyd,sha256=sIYp4x999XQ1PyUMJfKZOGpHiH0rI72SplyXUXg4Dm8,847872
39
+ torchaudio/lib/libtorchaudio.pyd,sha256=7OPdufJqCxLLcL8Tud6bnSdGyjItW7IEi8kM6N3DMWw,923648
40
+ torchaudio/models/__init__.py,sha256=Gi3UQvxjwTLW9wfKlF42O3Vup70d0bk2x-rZS89ASwI,2080
41
+ torchaudio/models/_hdemucs.py,sha256=ipAj7965PO_WEZqQwW1om9gQj90UhQOeU6HU3Lpvzwo,39250
42
+ torchaudio/models/conformer.py,sha256=gVrOYeJkPlVaX-4eZpVzNUe_r3k7g1Y6NaaQ8JZP-r4,10361
43
+ torchaudio/models/conv_tasnet.py,sha256=D7Y10sOzLe03gygfN1J5R73SIHkIGVQOkqKQ6Ni3o_s,12870
44
+ torchaudio/models/deepspeech.py,sha256=nVYc2xwWpFO6gu5CR0mbqLiAzJn8lAfHcdcP92i22mo,2830
45
+ torchaudio/models/emformer.py,sha256=WbaeZcrPFOOLn4igqweE0AfuF_SQZpqg7XPGEhl7C8c,38650
46
+ torchaudio/models/rnnt.py,sha256=PNJpZd3vH6wRq8TEf4UlPtVHbte9wOJ-bRMEug6gp08,36357
47
+ torchaudio/models/rnnt_decoder.py,sha256=CBBMZhhq5Bgax0_3p3SZD-Os3S1LFHB91oTgVED4bmY,13178
48
+ torchaudio/models/tacotron2.py,sha256=mZ5lLSa75oqc0hgkc3sIm5_gK-knhtgX3dmg9-oLQao,46960
49
+ torchaudio/models/wav2letter.py,sha256=oetxpH5RG0TadYB75IOmYOrnraaPvSlcSNpRZb2FE_A,3350
50
+ torchaudio/models/wavernn.py,sha256=LRgL36jA6WzI1PAzBY6P52oCMGSTOraXB8fEgkwpSxw,15855
51
+ torchaudio/models/decoder/__init__.py,sha256=PonG1Rg0CRBBbmRLZZQ1n2rXiDhivAAU9x67_G15seI,1963
52
+ torchaudio/models/decoder/_ctc_decoder.py,sha256=zKsOdPNrUn7v2QJmluC3kOp90RQaP3CSmQSurc1nAFw,20654
53
+ torchaudio/models/decoder/_cuda_ctc_decoder.py,sha256=4JKcQak4Ke6Id0EJEDJEx1yLTXKbJpIDNiu7QSe3gWU,7373
54
+ torchaudio/models/squim/__init__.py,sha256=eQox8kPviOthKulpzZvPK0a66NHW7MzYE4aOF7va_kU,357
55
+ torchaudio/models/squim/objective.py,sha256=FCYu0i2OXY3e6Z-BO2p-rc6rU0PvpJZ0gA-CPZZA9fw,12607
56
+ torchaudio/models/squim/subjective.py,sha256=1_gK9O3nvrjiikpP46IdsMzKduSTt91kKklA69wQqiw,5947
57
+ torchaudio/models/wav2vec2/__init__.py,sha256=j5FdQFfuIpdIKYwoMLop4Ba70GGoS-lK61tU-oNG5wg,972
58
+ torchaudio/models/wav2vec2/components.py,sha256=EzmuGc5qHVPrHCGqYVHTvdjqP2gCrBfnHSoTK9GsZ1w,48244
59
+ torchaudio/models/wav2vec2/model.py,sha256=kP6QKsF1PjleyUMhaPjydi0pCRy4GGUArRWBzfDJmdE,61671
60
+ torchaudio/models/wav2vec2/wavlm_attention.py,sha256=iYde9grsb_RaEs87FI5ykyN3z0Ix1plqpsMNvakAiWM,11058
61
+ torchaudio/models/wav2vec2/utils/__init__.py,sha256=1eowaOEKRbp7JajFNv_r47REJqnMmXidukS7Mrwp_5Q,188
62
+ torchaudio/models/wav2vec2/utils/import_fairseq.py,sha256=so7T-otDNCsTUtzJRUFFGWyd0caWl3RY_UbFMxJ4DJE,9411
63
+ torchaudio/models/wav2vec2/utils/import_huggingface.py,sha256=NMK6YrAIDfOw8j1tV-3XTwx_mwbJHvg8ldTrAWRztIM,6080
64
+ torchaudio/pipelines/__init__.py,sha256=oMwOu-1T_ugJmhdaoI5NrCDrUAGrpDOlJQO8h-bLAW4,2847
65
+ torchaudio/pipelines/_source_separation_pipeline.py,sha256=ttHqjcwCmCPWLj0YeDsTa1-XetuyjPDZ9D2deE3FmkA,4334
66
+ torchaudio/pipelines/_squim_pipeline.py,sha256=eYdrKVXUru3VdfpaDnMN5qCuKHNveEd_jwGqtemV9ls,6438
67
+ torchaudio/pipelines/rnnt_pipeline.py,sha256=16OMN_4yY1TEKLWjqkzFSMKByITxLobj6X1uk78pwQI,14133
68
+ torchaudio/pipelines/_tts/__init__.py,sha256=WKc5c06b_M9MvEohJZghJJWAL7vXvfwRIkdy85UCh04,442
69
+ torchaudio/pipelines/_tts/impl.py,sha256=wwrTyTEEkew22AnzB_ZklapGaAstJSUBawhA7bOcGXM,15759
70
+ torchaudio/pipelines/_tts/interface.py,sha256=y1mU0446Vy2hHpCwMqRZt1UI4ZXl-C4tJp92EylwHh0,10479
71
+ torchaudio/pipelines/_tts/utils.py,sha256=tuiEA5eqoBNgt46TxGA7lOEqljbuECL0-pc_uSco0xo,5040
72
+ torchaudio/pipelines/_wav2vec2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
+ torchaudio/pipelines/_wav2vec2/aligner.py,sha256=HOcthFgup97QMx9ZXCmkv6jdw_zxdRT-e_SilXEujNU,2796
74
+ torchaudio/pipelines/_wav2vec2/impl.py,sha256=I6htNo4Wt5LPxX9Z8rmxarFE8BZOZBUFIU9T9k1k2Po,67260
75
+ torchaudio/pipelines/_wav2vec2/utils.py,sha256=CVawfXmVGWY8mj-_6r4KO907BpF67WAVWHEHhycFIaM,7317
76
+ torchaudio/transforms/__init__.py,sha256=TsmUD7pXQO940uG0GhFTuMB48PT6uOklN5ptd-Yut14,1476
77
+ torchaudio/transforms/_multi_channel.py,sha256=Musw7dTu25HNjKeIcKHUDuqBmj_GC2e3TaakqJcffW8,22688
78
+ torchaudio/transforms/_transforms.py,sha256=g-E3nGgCEcKeWqEtyrDquSKfecHMD8olJRUMnqHHWYI,89057
79
+ torchaudio/utils/__init__.py,sha256=yNMWIjoGd68FPxV6PhDdjO1oRemlM0QPJsu_k6iVaGQ,74
80
+ torchaudio/utils/download.py,sha256=rf_yS18i7n4JYbIGpWiWc0ipe4sGv3Rvivv6p0DaZgU,2972
81
+ torchaudio-2.9.0.dist-info/LICENSE,sha256=MmOOF5kxv-VR6r9nsOZ6E7SD4Wa1jdcmNjSrf4nzlvU,1363
82
+ torchaudio-2.9.0.dist-info/METADATA,sha256=ygAM9_HG0s1iYrA3X6SLUDTX-QkiDZm6H91eEFlafgI,6672
83
+ torchaudio-2.9.0.dist-info/WHEEL,sha256=zTUm9EcnLPhoh5JJi6dtAMNB3fpxN5e25DClaOiiBcE,102
84
+ torchaudio-2.9.0.dist-info/top_level.txt,sha256=mPKWMIRWWW2JwbJN6wRckeN1gpbjhifapAF0Z9t7SMo,11
85
+ torchaudio-2.9.0.dist-info/RECORD,,
@@ -1,61 +0,0 @@
1
- from typing import List, Optional
2
-
3
- from torchaudio._internal.module_utils import deprecated
4
-
5
- from . import utils
6
- from .common import AudioMetaData
7
-
8
- __all__ = [
9
- "AudioMetaData",
10
- "load",
11
- "info",
12
- "save",
13
- "list_audio_backends",
14
- "get_audio_backend",
15
- "set_audio_backend",
16
- ]
17
-
18
-
19
- info = utils.get_info_func()
20
- load = utils.get_load_func()
21
- save = utils.get_save_func()
22
-
23
-
24
- def list_audio_backends() -> List[str]:
25
- """List available backends
26
-
27
- Returns:
28
- list of str: The list of available backends.
29
-
30
- The possible values are; ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``.
31
- """
32
-
33
- return list(utils.get_available_backends().keys())
34
-
35
-
36
- # Temporary until global backend is removed
37
- @deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
38
- def get_audio_backend() -> Optional[str]:
39
- """Get the name of the current global backend
40
-
41
- Returns:
42
- str or None:
43
- If dispatcher mode is enabled, returns ``None`` otherwise,
44
- the name of current backend or ``None`` (no backend is set).
45
- """
46
- return None
47
-
48
-
49
- # Temporary until global backend is removed
50
- @deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
51
- def set_audio_backend(backend: Optional[str]): # noqa
52
- """Set the global backend.
53
-
54
- This is a no-op when dispatcher mode is enabled.
55
-
56
- Args:
57
- backend (str or None): Name of the backend.
58
- One of ``"sox_io"`` or ``"soundfile"`` based on availability
59
- of the system. If ``None`` is provided the current backend is unassigned.
60
- """
61
- pass