torchaudio 2.8.0-cp313-cp313t-win_amd64.whl → 2.9.0-cp313-cp313t-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (92)
  1. torchaudio/__init__.py +179 -39
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +12 -3
  5. torchaudio/_torchcodec.py +73 -85
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +0 -2
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +26 -60
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +14 -2
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +1 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +1 -0
  23. torchaudio/transforms/_transforms.py +2 -2
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -3
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -350
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -20
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -150
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -68
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -441
  54. torchaudio/prototype/functional/_rir.py +0 -382
  55. torchaudio/prototype/functional/functional.py +0 -193
  56. torchaudio/prototype/models/__init__.py +0 -39
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -337
  59. torchaudio/prototype/models/conv_emformer.py +0 -529
  60. torchaudio/prototype/models/hifi_gan.py +0 -342
  61. torchaudio/prototype/models/rnnt.py +0 -717
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -402
  63. torchaudio/prototype/pipelines/__init__.py +0 -21
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -461
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -275
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -118
  75. torchaudio-2.8.0.dist-info/RECORD +0 -145
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -977
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -275
  91. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/LICENSE +0 -0
  92. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
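
Taken together, the deletions above remove the legacy I/O stack (torchaudio._backend, torchaudio.backend, torchaudio.io, torchaudio.sox_effects, torchaudio.kaldi_io and the bundled torio package) and the entire torchaudio.prototype namespace, while torchaudio/__init__.py and torchaudio/_torchcodec.py are reworked. A minimal, hedged sketch of how downstream code might feature-detect this change rather than pin on a version string; it assumes torchaudio.load remains exported as the torchcodec-backed entry point, which the file list implies but the diff does not show in full:

    import torchaudio

    # The legacy SoX effects module is one of the files deleted in this diff;
    # probing for it distinguishes the 2.8-style I/O stack from the 2.9 one.
    try:
        import torchaudio.sox_effects  # present in <= 2.8, removed here
        has_legacy_io = True
    except ImportError:
        has_legacy_io = False

    # "speech.wav" is a placeholder path; torchaudio.load is assumed to remain
    # available, now routed through torchcodec (see torchaudio/_torchcodec.py above).
    waveform, sample_rate = torchaudio.load("speech.wav")
    print(has_legacy_io, waveform.shape, sample_rate)

The deleted files themselves follow.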
torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
@@ -1,83 +0,0 @@
- from dataclasses import dataclass
- from typing import Callable, Dict
-
- from torchaudio._internal.module_utils import dropping_class_support
-
-
- from ._vggish_impl import _SAMPLE_RATE, VGGish as _VGGish, VGGishInputProcessor as _VGGishInputProcessor
-
-
- def _get_state_dict():
-     path = torchaudio.utils.download_asset("models/vggish.pt")
-     return torch.load(path)
-
-
- @dropping_class_support
- @dataclass
- class VGGishBundle:
-     """VGGish :cite:`45611` inference pipeline ported from
-     `torchvggish <https://github.com/harritaylor/torchvggish>`__
-     and `tensorflow-models <https://github.com/tensorflow/models/tree/master/research/audioset>`__.
-
-     Example:
-         >>> import torchaudio
-         >>> from torchaudio.prototype.pipelines import VGGISH
-         >>>
-         >>> input_sr = VGGISH.sample_rate
-         >>> input_proc = VGGISH.get_input_processor()
-         >>> model = VGGISH.get_model()
-         >>>
-         >>> waveform, sr = torchaudio.load(
-         >>>     "Chopin_Ballade_-1_In_G_Minor,_Op._23.mp3",
-         >>> )
-         >>> waveform = waveform.squeeze(0)
-         >>> waveform = torchaudio.functional.resample(waveform, sr, input_sr)
-         >>> mono_output = model(input_proc(waveform))
-     """
-
-     class VGGish(_VGGish):
-         __doc__ = _VGGish.__doc__
-
-     class VGGishInputProcessor(_VGGishInputProcessor):
-         __doc__ = _VGGishInputProcessor.__doc__
-
-     _state_dict_func: Callable[[], Dict]
-
-     @property
-     def sample_rate(self) -> int:
-         """Sample rate of input waveform expected by input processor and model.
-
-         :type: int
-         """
-         return _SAMPLE_RATE
-
-     def get_model(self) -> VGGish:
-         """Constructs pre-trained VGGish model. Downloads and caches weights as necessary.
-
-         Returns:
-             VGGish: VGGish model with pre-trained weights loaded.
-         """
-         model = self.VGGish()
-         state_dict = self._state_dict_func()
-         model.load_state_dict(state_dict)
-         model.eval()
-         return model
-
-     def get_input_processor(self) -> VGGishInputProcessor:
-         """Constructs input processor for VGGish.
-
-         Returns:
-             VGGishInputProcessor: input processor for VGGish.
-         """
-         return self.VGGishInputProcessor()
-
-
- VGGISH = VGGishBundle(_get_state_dict)
- VGGISH.__doc__ = """Pre-trained VGGish :cite:`45611` inference pipeline ported from
- `torchvggish <https://github.com/harritaylor/torchvggish>`__
- and `tensorflow-models <https://github.com/tensorflow/models/tree/master/research/audioset>`__.
-
- Per the `documentation <https://github.com/tensorflow/models/tree/master/research/audioset/vggish>`__
- for the original model, the model is "trained on a large YouTube dataset (a preliminary version of
- what later became YouTube-8M)".
- """
torchaudio/prototype/pipelines/hifigan_pipeline.py
@@ -1,233 +0,0 @@
- from dataclasses import dataclass
- from typing import Any, Dict, Optional
-
- import torch
- import torch.nn.functional as F
- from torch.nn import Module
- from torchaudio._internal import load_state_dict_from_url
-
- from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
- from torchaudio.transforms import MelSpectrogram
-
- from torchaudio._internal.module_utils import dropping_support, dropping_class_support
-
-
- @dropping_class_support
- @dataclass
- class HiFiGANVocoderBundle:
-     """Data class that bundles associated information to use pretrained
-     :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
-
-     This class provides interfaces for instantiating the pretrained model along with
-     the information necessary to retrieve pretrained weights and additional data
-     to be used with the model.
-
-     Torchaudio library instantiates objects of this class, each of which represents
-     a different pretrained model. Client code should access pretrained models via these
-     instances.
-
-     This bundle can convert mel spectrogram to waveforms and vice versa. A typical use case would be a flow like
-     `text -> mel spectrogram -> waveform`, where one can use an external component, e.g. Tacotron2,
-     to generate mel spectrogram from text. Please see below for the code example.
-
-     Example: Transform synthetic mel spectrogram to audio.
-         >>> import torch
-         >>> import torchaudio
-         >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
-         >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle
-         >>>
-         >>> # Load the HiFiGAN bundle
-         >>> vocoder = bundle.get_vocoder()
-         Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
-         100%|████████████| 5.59M/5.59M [00:00<00:00, 18.7MB/s]
-         >>>
-         >>> # Generate synthetic mel spectrogram
-         >>> specgram = torch.sin(0.5 * torch.arange(start=0, end=100)).expand(bundle._vocoder_params["in_channels"], 100)
-         >>>
-         >>> # Transform mel spectrogram into audio
-         >>> waveform = vocoder(specgram)
-         >>> torchaudio.save('sample.wav', waveform, bundle.sample_rate)
-
-     Example: Usage together with Tacotron2, text to audio.
-         >>> import torch
-         >>> import torchaudio
-         >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
-         >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle_hifigan
-         >>>
-         >>> # Load Tacotron2 bundle
-         >>> bundle_tactron2 = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
-         >>> processor = bundle_tactron2.get_text_processor()
-         >>> tacotron2 = bundle_tactron2.get_tacotron2()
-         >>>
-         >>> # Use Tacotron2 to convert text to mel spectrogram
-         >>> text = "A quick brown fox jumped over a lazy dog"
-         >>> input, lengths = processor(text)
-         >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
-         >>>
-         >>> # Load HiFiGAN bundle
-         >>> vocoder = bundle_hifigan.get_vocoder()
-         Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
-         100%|████████████| 5.59M/5.59M [00:03<00:00, 1.55MB/s]
-         >>>
-         >>> # Use HiFiGAN to convert mel spectrogram to audio
-         >>> waveform = vocoder(specgram).squeeze(0)
-         >>> torchaudio.save('sample.wav', waveform, bundle_hifigan.sample_rate)
-     """  # noqa: E501
-
-     _path: str
-     _vocoder_params: Dict[str, Any]  # Vocoder parameters
-     _mel_params: Dict[str, Any]  # Mel transformation parameters
-     _sample_rate: float
-
-     def _get_state_dict(self, dl_kwargs):
-         url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
-         dl_kwargs = {} if dl_kwargs is None else dl_kwargs
-         state_dict = load_state_dict_from_url(url, **dl_kwargs)
-         return state_dict
-
-     @dropping_support
-     def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
-         """Construct the HiFiGAN Generator model, which can be used as a vocoder, and load the pretrained weight.
-
-         The weight file is downloaded from the internet and cached with
-         :func:`torch.hub.load_state_dict_from_url`
-
-         Args:
-             dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
-
-         Returns:
-             Variation of :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
-         """
-         model = hifigan_vocoder(**self._vocoder_params)
-         model.load_state_dict(self._get_state_dict(dl_kwargs))
-         model.eval()
-         return model
-
-     @dropping_support
-     def get_mel_transform(self) -> Module:
-         """Construct an object which transforms waveforms into mel spectrograms."""
-         return _HiFiGANMelSpectrogram(
-             n_mels=self._vocoder_params["in_channels"],
-             sample_rate=self._sample_rate,
-             **self._mel_params,
-         )
-
-     @property
-     def sample_rate(self):
-         """Sample rate of the audio that the model is trained on.
-
-         :type: float
-         """
-         return self._sample_rate
-
-
- class _HiFiGANMelSpectrogram(torch.nn.Module):
-     """
-     Generate mel spectrogram in a way equivalent to the original HiFiGAN implementation:
-     https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72
-
-     This class wraps around :py:class:`torchaudio.transforms.MelSpectrogram`, but performs extra steps to achieve
-     equivalence with the HiFiGAN implementation.
-
-     Args:
-         hop_size (int): Length of hop between STFT windows.
-         n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins.
-         win_length (int): Window size.
-         f_min (float or None): Minimum frequency.
-         f_max (float or None): Maximum frequency.
-         sample_rate (int): Sample rate of audio signal.
-         n_mels (int): Number of mel filterbanks.
-     """
-
-     def __init__(
-         self,
-         hop_size: int,
-         n_fft: int,
-         win_length: int,
-         f_min: Optional[float],
-         f_max: Optional[float],
-         sample_rate: float,
-         n_mels: int,
-     ):
-         super(_HiFiGANMelSpectrogram, self).__init__()
-         self.mel_transform = MelSpectrogram(
-             sample_rate=sample_rate,
-             n_fft=n_fft,
-             win_length=win_length,
-             hop_length=hop_size,
-             f_min=f_min,
-             f_max=f_max,
-             n_mels=n_mels,
-             normalized=False,
-             pad=0,
-             mel_scale="slaney",
-             norm="slaney",
-             center=False,
-         )
-         self.sample_rate = sample_rate
-         self.hop_size = hop_size
-         self.n_fft = n_fft
-         self.win_length = win_length
-         self.f_min = f_min
-         self.f_max = f_max
-         self.n_mels = n_mels
-         self.pad_size = int((n_fft - hop_size) / 2)
-
-     def forward(self, waveform: torch.Tensor) -> torch.Tensor:
-         """Generate mel spectrogram from a waveform. Should have same sample rate as ``self.sample_rate``.
-
-         Args:
-             waveform (Tensor): waveform of shape ``(batch_size, time_length)``.
-         Returns:
-             Tensor of shape ``(batch_size, n_mel, time_length)``
-         """
-         ref_waveform = F.pad(waveform.unsqueeze(1), (self.pad_size, self.pad_size), mode="reflect")
-         ref_waveform = ref_waveform.squeeze(1)
-
-         spectr = (self.mel_transform.spectrogram(ref_waveform) + 1e-9) ** 0.5
-         mel_spectrogram = self.mel_transform.mel_scale(spectr)
-         mel_spectrogram = torch.log(torch.clamp(mel_spectrogram, min=1e-5))
-         return mel_spectrogram
-
-
- HIFIGAN_VOCODER_V3_LJSPEECH = HiFiGANVocoderBundle(
-     "hifigan_vocoder_v3_ljspeech.pth",
-     _vocoder_params={
-         "upsample_rates": (8, 8, 4),
-         "upsample_kernel_sizes": (16, 16, 8),
-         "upsample_initial_channel": 256,
-         "resblock_kernel_sizes": (3, 5, 7),
-         "resblock_dilation_sizes": ((1, 2), (2, 6), (3, 12)),
-         "resblock_type": 2,
-         "in_channels": 80,
-         "lrelu_slope": 0.1,
-     },
-     _mel_params={
-         "hop_size": 256,
-         "n_fft": 1024,
-         "win_length": 1024,
-         "f_min": 0,
-         "f_max": 8000,
-     },
-     _sample_rate=22050,
- )
- HIFIGAN_VOCODER_V3_LJSPEECH.__doc__ = """HiFiGAN Vocoder pipeline, trained on *The LJ Speech Dataset*
- :cite:`ljspeech17`.
-
- This pipeline can be used with an external component which generates mel spectrograms from text, for example,
- Tacotron2 - see examples in :py:class:`HiFiGANVocoderBundle`.
- Although this works with the existing Tacotron2 bundles, for the best results one needs to retrain Tacotron2
- using the same data preprocessing pipeline which was used for training HiFiGAN. In particular, the original
- HiFiGAN implementation uses a custom method of generating mel spectrograms from waveforms, different from
- :py:class:`torchaudio.transforms.MelSpectrogram`. We reimplemented this transform as
- :py:meth:`HiFiGANVocoderBundle.get_mel_transform`, making sure it is equivalent to the original HiFiGAN code `here
- <https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72>`_.
-
- The underlying vocoder is constructed by
- :py:func:`torchaudio.prototype.models.hifigan_vocoder`. The weights are converted from the ones published
- with the original paper :cite:`NEURIPS2020_c5d73680` under `MIT License
- <https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/LICENSE>`__. See links to
- pre-trained models on `GitHub <https://github.com/jik876/hifi-gan#pretrained-model>`__.
-
- Please refer to :py:class:`HiFiGANVocoderBundle` for usage instructions.
- """
torchaudio/prototype/pipelines/rnnt_pipeline.py
@@ -1,58 +0,0 @@
- from functools import partial
-
- from torchaudio.models import emformer_rnnt_base
- from torchaudio.pipelines import RNNTBundle
-
-
- EMFORMER_RNNT_BASE_MUSTC = RNNTBundle(
-     _rnnt_path="models/emformer_rnnt_base_mustc.pt",
-     _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
-     _global_stats_path="pipeline-assets/global_stats_rnnt_mustc.json",
-     _sp_model_path="pipeline-assets/spm_bpe_500_mustc.model",
-     _right_padding=4,
-     _blank=500,
-     _sample_rate=16000,
-     _n_fft=400,
-     _n_mels=80,
-     _hop_length=160,
-     _segment_length=16,
-     _right_context_length=4,
- )
- EMFORMER_RNNT_BASE_MUSTC.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both
- streaming and non-streaming inference.
-
- The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
- and utilizes weights trained on *MuST-C release v2.0* :cite:`CATTONI2021101155` dataset
- using training script ``train.py``
- `here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__
- with ``num_symbols=501``.
-
- Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
- """
-
-
- EMFORMER_RNNT_BASE_TEDLIUM3 = RNNTBundle(
-     _rnnt_path="models/emformer_rnnt_base_tedlium3.pt",
-     _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
-     _global_stats_path="pipeline-assets/global_stats_rnnt_tedlium3.json",
-     _sp_model_path="pipeline-assets/spm_bpe_500_tedlium3.model",
-     _right_padding=4,
-     _blank=500,
-     _sample_rate=16000,
-     _n_fft=400,
-     _n_mels=80,
-     _hop_length=160,
-     _segment_length=16,
-     _right_context_length=4,
- )
- EMFORMER_RNNT_BASE_TEDLIUM3.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both
- streaming and non-streaming inference.
-
- The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
- and utilizes weights trained on *TED-LIUM Release 3* :cite:`rousseau2012tedlium` dataset
- using training script ``train.py``
- `here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__
- with ``num_symbols=501``.
-
- Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
- """
torchaudio/prototype/transforms/__init__.py
@@ -1,9 +0,0 @@
- from ._transforms import BarkScale, BarkSpectrogram, ChromaScale, ChromaSpectrogram, InverseBarkScale
-
- __all__ = [
-     "BarkScale",
-     "BarkSpectrogram",
-     "ChromaScale",
-     "ChromaSpectrogram",
-     "InverseBarkScale",
- ]
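
These Bark- and Chroma-scale transforms leave with the rest of the prototype namespace and have no replacement elsewhere in this diff. A small, hedged sketch of what disappears, runnable only against torchaudio 2.8 or earlier; the parameters are illustrative, not canonical:

    import torch
    from torchaudio.prototype.transforms import ChromaSpectrogram  # removed in 2.9 per this diff

    transform = ChromaSpectrogram(sample_rate=16000, n_fft=1024)  # illustrative parameters
    waveform = torch.randn(1, 16000)   # one second of noise as a stand-in signal
    chroma = transform(waveform)       # pitch-class energy over time, one bin per semitone class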