torchaudio-2.7.1-cp311-cp311-win_amd64.whl → torchaudio-2.9.0-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchaudio might be problematic.

Files changed (92)
  1. torchaudio/__init__.py +184 -33
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +68 -10
  5. torchaudio/_torchcodec.py +340 -0
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +6 -3
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +31 -61
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +19 -1
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +3 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +4 -1
  23. torchaudio/transforms/_transforms.py +4 -3
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -1
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/METADATA +15 -7
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -317
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -13
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -144
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -67
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -433
  54. torchaudio/prototype/functional/_rir.py +0 -379
  55. torchaudio/prototype/functional/functional.py +0 -190
  56. torchaudio/prototype/models/__init__.py +0 -36
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -794
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -333
  59. torchaudio/prototype/models/conv_emformer.py +0 -525
  60. torchaudio/prototype/models/hifi_gan.py +0 -336
  61. torchaudio/prototype/models/rnnt.py +0 -711
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -399
  63. torchaudio/prototype/pipelines/__init__.py +0 -12
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -3
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -233
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -82
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -228
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -456
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -272
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -99
  75. torchaudio-2.7.1.dist-info/RECORD +0 -144
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -978
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -247
  91. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
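Most of this diff is the removal of the legacy I/O stack: the `torchaudio/_backend` and `torchaudio/backend` packages (SoX, soundfile and FFmpeg backends), `torchaudio/io`, `torchaudio/sox_effects`, `torchaudio/kaldi_io.py`, the entire `torio` FFmpeg binding, and the whole `torchaudio/prototype` tree, with a new `torchaudio/_torchcodec.py` shim added in their place. The sketch below is a hedged illustration of the kind of code that should survive the upgrade, under the assumption that `torchaudio.load`/`torchaudio.save` remain exported in 2.9.0 and are now backed by the separately installed `torchcodec` package; the file names are placeholders.

```python
# Hedged migration sketch for the removed I/O stack. Assumptions (not verifiable
# from this diff alone): torchaudio.load/save remain exported in 2.9.0 and are
# backed by the new _torchcodec.py shim, which requires the separate `torchcodec`
# package to be installed; "speech.wav" / "copy.wav" are placeholder paths.
import torchaudio

waveform, sample_rate = torchaudio.load("speech.wav")
torchaudio.save("copy.wav", waveform, sample_rate)

# Imports that this diff removes and that will now fail:
#   from torchaudio.io import StreamReader
#   import torchaudio.sox_effects
#   import torio
```

Code built on `StreamReader`/`StreamWriter`, the SoX effects chain, or `torio` has no drop-in replacement in this wheel and needs to be ported rather than just reinstalled.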
torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
@@ -1,82 +0,0 @@
- from dataclasses import dataclass
- from typing import Callable, Dict
-
- import torch
- import torchaudio
-
- from ._vggish_impl import _SAMPLE_RATE, VGGish as _VGGish, VGGishInputProcessor as _VGGishInputProcessor
-
-
- def _get_state_dict():
-     path = torchaudio.utils.download_asset("models/vggish.pt")
-     return torch.load(path)
-
-
- @dataclass
- class VGGishBundle:
-     """VGGish :cite:`45611` inference pipeline ported from
-     `torchvggish <https://github.com/harritaylor/torchvggish>`__
-     and `tensorflow-models <https://github.com/tensorflow/models/tree/master/research/audioset>`__.
-
-     Example:
-         >>> import torchaudio
-         >>> from torchaudio.prototype.pipelines import VGGISH
-         >>>
-         >>> input_sr = VGGISH.sample_rate
-         >>> input_proc = VGGISH.get_input_processor()
-         >>> model = VGGISH.get_model()
-         >>>
-         >>> waveform, sr = torchaudio.load(
-         >>>     "Chopin_Ballade_-1_In_G_Minor,_Op._23.mp3",
-         >>> )
-         >>> waveform = waveform.squeeze(0)
-         >>> waveform = torchaudio.functional.resample(waveform, sr, input_sr)
-         >>> mono_output = model(input_proc(waveform))
-     """
-
-     class VGGish(_VGGish):
-         __doc__ = _VGGish.__doc__
-
-     class VGGishInputProcessor(_VGGishInputProcessor):
-         __doc__ = _VGGishInputProcessor.__doc__
-
-     _state_dict_func: Callable[[], Dict]
-
-     @property
-     def sample_rate(self) -> int:
-         """Sample rate of input waveform expected by input processor and model.
-
-         :type: int
-         """
-         return _SAMPLE_RATE
-
-     def get_model(self) -> VGGish:
-         """Constructs pre-trained VGGish model. Downloads and caches weights as necessary.
-
-         Returns:
-             VGGish: VGGish model with pre-trained weights loaded.
-         """
-         model = self.VGGish()
-         state_dict = self._state_dict_func()
-         model.load_state_dict(state_dict)
-         model.eval()
-         return model
-
-     def get_input_processor(self) -> VGGishInputProcessor:
-         """Constructs input processor for VGGish.
-
-         Returns:
-             VGGishInputProcessor: input processor for VGGish.
-         """
-         return self.VGGishInputProcessor()
-
-
- VGGISH = VGGishBundle(_get_state_dict)
- VGGISH.__doc__ = """Pre-trained VGGish :cite:`45611` inference pipeline ported from
- `torchvggish <https://github.com/harritaylor/torchvggish>`__
- and `tensorflow-models <https://github.com/tensorflow/models/tree/master/research/audioset>`__.
-
- Per the `documentation <https://github.com/tensorflow/models/tree/master/research/audioset/vggish>`__
- for the original model, the model is "trained on a large YouTube dataset (a preliminary version of
- what later became YouTube-8M)".
- """
torchaudio/prototype/pipelines/hifigan_pipeline.py
@@ -1,228 +0,0 @@
- from dataclasses import dataclass
- from typing import Any, Dict, Optional
-
- import torch
- import torch.nn.functional as F
- from torch.nn import Module
- from torchaudio._internal import load_state_dict_from_url
-
- from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
- from torchaudio.transforms import MelSpectrogram
-
-
- @dataclass
- class HiFiGANVocoderBundle:
-     """Data class that bundles associated information to use pretrained
-     :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
-
-     This class provides interfaces for instantiating the pretrained model along with
-     the information necessary to retrieve pretrained weights and additional data
-     to be used with the model.
-
-     Torchaudio library instantiates objects of this class, each of which represents
-     a different pretrained model. Client code should access pretrained models via these
-     instances.
-
-     This bundle can convert mel spectrorgam to waveforms and vice versa. A typical use case would be a flow like
-     `text -> mel spectrogram -> waveform`, where one can use an external component, e.g. Tacotron2,
-     to generate mel spectrogram from text. Please see below for the code example.
-
-     Example: Transform synthetic mel spectrogram to audio.
-         >>> import torch
-         >>> import torchaudio
-         >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
-         >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle
-         >>>
-         >>> # Load the HiFiGAN bundle
-         >>> vocoder = bundle.get_vocoder()
-         Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
-         100%|████████████| 5.59M/5.59M [00:00<00:00, 18.7MB/s]
-         >>>
-         >>> # Generate synthetic mel spectrogram
-         >>> specgram = torch.sin(0.5 * torch.arange(start=0, end=100)).expand(bundle._vocoder_params["in_channels"], 100)
-         >>>
-         >>> # Transform mel spectrogram into audio
-         >>> waveform = vocoder(specgram)
-         >>> torchaudio.save('sample.wav', waveform, bundle.sample_rate)
-
-     Example: Usage together with Tacotron2, text to audio.
-         >>> import torch
-         >>> import torchaudio
-         >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
-         >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle_hifigan
-         >>>
-         >>> # Load Tacotron2 bundle
-         >>> bundle_tactron2 = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
-         >>> processor = bundle_tactron2.get_text_processor()
-         >>> tacotron2 = bundle_tactron2.get_tacotron2()
-         >>>
-         >>> # Use Tacotron2 to convert text to mel spectrogram
-         >>> text = "A quick brown fox jumped over a lazy dog"
-         >>> input, lengths = processor(text)
-         >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
-         >>>
-         >>> # Load HiFiGAN bundle
-         >>> vocoder = bundle_hifigan.get_vocoder()
-         Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
-         100%|████████████| 5.59M/5.59M [00:03<00:00, 1.55MB/s]
-         >>>
-         >>> # Use HiFiGAN to convert mel spectrogram to audio
-         >>> waveform = vocoder(specgram).squeeze(0)
-         >>> torchaudio.save('sample.wav', waveform, bundle_hifigan.sample_rate)
-     """  # noqa: E501
-
-     _path: str
-     _vocoder_params: Dict[str, Any]  # Vocoder parameters
-     _mel_params: Dict[str, Any]  # Mel transformation parameters
-     _sample_rate: float
-
-     def _get_state_dict(self, dl_kwargs):
-         url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
-         dl_kwargs = {} if dl_kwargs is None else dl_kwargs
-         state_dict = load_state_dict_from_url(url, **dl_kwargs)
-         return state_dict
-
-     def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
-         """Construct the HiFiGAN Generator model, which can be used a vocoder, and load the pretrained weight.
-
-         The weight file is downloaded from the internet and cached with
-         :func:`torch.hub.load_state_dict_from_url`
-
-         Args:
-             dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
-
-         Returns:
-             Variation of :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
-         """
-         model = hifigan_vocoder(**self._vocoder_params)
-         model.load_state_dict(self._get_state_dict(dl_kwargs))
-         model.eval()
-         return model
-
-     def get_mel_transform(self) -> Module:
-         """Construct an object which transforms waveforms into mel spectrograms."""
-         return _HiFiGANMelSpectrogram(
-             n_mels=self._vocoder_params["in_channels"],
-             sample_rate=self._sample_rate,
-             **self._mel_params,
-         )
-
-     @property
-     def sample_rate(self):
-         """Sample rate of the audio that the model is trained on.
-
-         :type: float
-         """
-         return self._sample_rate
-
-
- class _HiFiGANMelSpectrogram(torch.nn.Module):
-     """
-     Generate mel spectrogram in a way equivalent to the original HiFiGAN implementation:
-     https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72
-
-     This class wraps around :py:class:`torchaudio.transforms.MelSpectrogram`, but performs extra steps to achive
-     equivalence with the HiFiGAN implementation.
-
-     Args:
-         hop_size (int): Length of hop between STFT windows.
-         n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins.
-         win_length (int): Window size.
-         f_min (float or None): Minimum frequency.
-         f_max (float or None): Maximum frequency.
-         sample_rate (int): Sample rate of audio signal.
-         n_mels (int): Number of mel filterbanks.
-     """
-
-     def __init__(
-         self,
-         hop_size: int,
-         n_fft: int,
-         win_length: int,
-         f_min: Optional[float],
-         f_max: Optional[float],
-         sample_rate: float,
-         n_mels: int,
-     ):
-         super(_HiFiGANMelSpectrogram, self).__init__()
-         self.mel_transform = MelSpectrogram(
-             sample_rate=sample_rate,
-             n_fft=n_fft,
-             win_length=win_length,
-             hop_length=hop_size,
-             f_min=f_min,
-             f_max=f_max,
-             n_mels=n_mels,
-             normalized=False,
-             pad=0,
-             mel_scale="slaney",
-             norm="slaney",
-             center=False,
-         )
-         self.sample_rate = sample_rate
-         self.hop_size = hop_size
-         self.n_fft = n_fft
-         self.win_length = win_length
-         self.f_min = f_min
-         self.f_max = f_max
-         self.n_mels = n_mels
-         self.pad_size = int((n_fft - hop_size) / 2)
-
-     def forward(self, waveform: torch.Tensor) -> torch.Tensor:
-         """Generate mel spectrogram from a waveform. Should have same sample rate as ``self.sample_rate``.
-
-         Args:
-             waveform (Tensor): waveform of shape ``(batch_size, time_length)``.
-         Returns:
-             Tensor of shape ``(batch_size, n_mel, time_length)``
-         """
-         ref_waveform = F.pad(waveform.unsqueeze(1), (self.pad_size, self.pad_size), mode="reflect")
-         ref_waveform = ref_waveform.squeeze(1)
-
-         spectr = (self.mel_transform.spectrogram(ref_waveform) + 1e-9) ** 0.5
-         mel_spectrogram = self.mel_transform.mel_scale(spectr)
-         mel_spectrogram = torch.log(torch.clamp(mel_spectrogram, min=1e-5))
-         return mel_spectrogram
-
-
- HIFIGAN_VOCODER_V3_LJSPEECH = HiFiGANVocoderBundle(
-     "hifigan_vocoder_v3_ljspeech.pth",
-     _vocoder_params={
-         "upsample_rates": (8, 8, 4),
-         "upsample_kernel_sizes": (16, 16, 8),
-         "upsample_initial_channel": 256,
-         "resblock_kernel_sizes": (3, 5, 7),
-         "resblock_dilation_sizes": ((1, 2), (2, 6), (3, 12)),
-         "resblock_type": 2,
-         "in_channels": 80,
-         "lrelu_slope": 0.1,
-     },
-     _mel_params={
-         "hop_size": 256,
-         "n_fft": 1024,
-         "win_length": 1024,
-         "f_min": 0,
-         "f_max": 8000,
-     },
-     _sample_rate=22050,
- )
- HIFIGAN_VOCODER_V3_LJSPEECH.__doc__ = """HiFiGAN Vocoder pipeline, trained on *The LJ Speech Dataset*
- :cite:`ljspeech17`.
-
- This pipeine can be used with an external component which generates mel spectrograms from text, for example,
- Tacotron2 - see examples in :py:class:`HiFiGANVocoderBundle`.
- Although this works with the existing Tacotron2 bundles, for the best results one needs to retrain Tacotron2
- using the same data preprocessing pipeline which was used for training HiFiGAN. In particular, the original
- HiFiGAN implementation uses a custom method of generating mel spectrograms from waveforms, different from
- :py:class:`torchaudio.transforms.MelSpectrogram`. We reimplemented this transform as
- :py:meth:`HiFiGANVocoderBundle.get_mel_transform`, making sure it is equivalent to the original HiFiGAN code `here
- <https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72>`_.
-
- The underlying vocoder is constructed by
- :py:func:`torchaudio.prototype.models.hifigan_vocoder`. The weights are converted from the ones published
- with the original paper :cite:`NEURIPS2020_c5d73680` under `MIT License
- <https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/LICENSE>`__. See links to
- pre-trained models on `GitHub <https://github.com/jik876/hifi-gan#pretrained-model>`__.
-
- Please refer to :py:class:`HiFiGANVocoderBundle` for usage instructions.
- """
torchaudio/prototype/pipelines/rnnt_pipeline.py
@@ -1,58 +0,0 @@
- from functools import partial
-
- from torchaudio.models import emformer_rnnt_base
- from torchaudio.pipelines import RNNTBundle
-
-
- EMFORMER_RNNT_BASE_MUSTC = RNNTBundle(
-     _rnnt_path="models/emformer_rnnt_base_mustc.pt",
-     _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
-     _global_stats_path="pipeline-assets/global_stats_rnnt_mustc.json",
-     _sp_model_path="pipeline-assets/spm_bpe_500_mustc.model",
-     _right_padding=4,
-     _blank=500,
-     _sample_rate=16000,
-     _n_fft=400,
-     _n_mels=80,
-     _hop_length=160,
-     _segment_length=16,
-     _right_context_length=4,
- )
- EMFORMER_RNNT_BASE_MUSTC.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both
- streaming and non-streaming inference.
-
- The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
- and utilizes weights trained on *MuST-C release v2.0* :cite:`CATTONI2021101155` dataset
- using training script ``train.py``
- `here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__
- with ``num_symbols=501``.
-
- Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
- """
-
-
- EMFORMER_RNNT_BASE_TEDLIUM3 = RNNTBundle(
-     _rnnt_path="models/emformer_rnnt_base_tedlium3.pt",
-     _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=501),
-     _global_stats_path="pipeline-assets/global_stats_rnnt_tedlium3.json",
-     _sp_model_path="pipeline-assets/spm_bpe_500_tedlium3.model",
-     _right_padding=4,
-     _blank=500,
-     _sample_rate=16000,
-     _n_fft=400,
-     _n_mels=80,
-     _hop_length=160,
-     _segment_length=16,
-     _right_context_length=4,
- )
- EMFORMER_RNNT_BASE_TEDLIUM3.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both
- streaming and non-streaming inference.
-
- The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
- and utilizes weights trained on *TED-LIUM Release 3* :cite:`rousseau2012tedlium` dataset
- using training script ``train.py``
- `here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__
- with ``num_symbols=501``.
-
- Please refer to :py:class:`torchaudio.pipelines.RNNTBundle` for usage instructions.
- """
torchaudio/prototype/transforms/__init__.py
@@ -1,9 +0,0 @@
- from ._transforms import BarkScale, BarkSpectrogram, ChromaScale, ChromaSpectrogram, InverseBarkScale
-
- __all__ = [
-     "BarkScale",
-     "BarkSpectrogram",
-     "ChromaScale",
-     "ChromaSpectrogram",
-     "InverseBarkScale",
- ]
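This last hunk drops the prototype transform namespace (`BarkScale`, `BarkSpectrogram`, `ChromaScale`, `ChromaSpectrogram`, `InverseBarkScale`) with no replacement in the 2.9.0 wheel. A hedged pattern for keeping dependent code importable across both sides of this diff is sketched below; the `ChromaSpectrogram` constructor arguments shown are only the minimal ones and are an assumption about the 2.7.x signature.

```python
# Sketch: guard the prototype import so code loads on both the 2.7.1 and 2.9.0 wheels.
try:
    from torchaudio.prototype.transforms import ChromaSpectrogram  # present in 2.7.1, removed later
except ImportError:
    ChromaSpectrogram = None

def make_chroma(sample_rate: int, n_fft: int = 2048):
    if ChromaSpectrogram is None:
        raise RuntimeError(
            "torchaudio.prototype.transforms is not available in this build; "
            "pin torchaudio==2.7.* or vendor the transform."
        )
    return ChromaSpectrogram(sample_rate=sample_rate, n_fft=n_fft)  # assumed 2.7.x signature
```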