torchaudio-2.6.0-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchaudio might be problematic.

Files changed (144)
  1. torchaudio/__init__.py +53 -0
  2. torchaudio/_backend/__init__.py +61 -0
  3. torchaudio/_backend/backend.py +53 -0
  4. torchaudio/_backend/common.py +52 -0
  5. torchaudio/_backend/ffmpeg.py +334 -0
  6. torchaudio/_backend/soundfile.py +54 -0
  7. torchaudio/_backend/soundfile_backend.py +457 -0
  8. torchaudio/_backend/sox.py +91 -0
  9. torchaudio/_backend/utils.py +317 -0
  10. torchaudio/_extension/__init__.py +74 -0
  11. torchaudio/_extension/utils.py +180 -0
  12. torchaudio/_internal/__init__.py +10 -0
  13. torchaudio/_internal/module_utils.py +113 -0
  14. torchaudio/backend/__init__.py +8 -0
  15. torchaudio/backend/_no_backend.py +25 -0
  16. torchaudio/backend/_sox_io_backend.py +294 -0
  17. torchaudio/backend/common.py +13 -0
  18. torchaudio/backend/no_backend.py +14 -0
  19. torchaudio/backend/soundfile_backend.py +14 -0
  20. torchaudio/backend/sox_io_backend.py +14 -0
  21. torchaudio/compliance/__init__.py +5 -0
  22. torchaudio/compliance/kaldi.py +813 -0
  23. torchaudio/datasets/__init__.py +47 -0
  24. torchaudio/datasets/cmuarctic.py +157 -0
  25. torchaudio/datasets/cmudict.py +186 -0
  26. torchaudio/datasets/commonvoice.py +86 -0
  27. torchaudio/datasets/dr_vctk.py +121 -0
  28. torchaudio/datasets/fluentcommands.py +108 -0
  29. torchaudio/datasets/gtzan.py +1118 -0
  30. torchaudio/datasets/iemocap.py +147 -0
  31. torchaudio/datasets/librilight_limited.py +111 -0
  32. torchaudio/datasets/librimix.py +133 -0
  33. torchaudio/datasets/librispeech.py +174 -0
  34. torchaudio/datasets/librispeech_biasing.py +189 -0
  35. torchaudio/datasets/libritts.py +168 -0
  36. torchaudio/datasets/ljspeech.py +107 -0
  37. torchaudio/datasets/musdb_hq.py +139 -0
  38. torchaudio/datasets/quesst14.py +136 -0
  39. torchaudio/datasets/snips.py +157 -0
  40. torchaudio/datasets/speechcommands.py +183 -0
  41. torchaudio/datasets/tedlium.py +218 -0
  42. torchaudio/datasets/utils.py +54 -0
  43. torchaudio/datasets/vctk.py +143 -0
  44. torchaudio/datasets/voxceleb1.py +309 -0
  45. torchaudio/datasets/yesno.py +89 -0
  46. torchaudio/functional/__init__.py +127 -0
  47. torchaudio/functional/_alignment.py +128 -0
  48. torchaudio/functional/filtering.py +1670 -0
  49. torchaudio/functional/functional.py +2535 -0
  50. torchaudio/io/__init__.py +13 -0
  51. torchaudio/io/_effector.py +347 -0
  52. torchaudio/io/_playback.py +72 -0
  53. torchaudio/kaldi_io.py +144 -0
  54. torchaudio/lib/__init__.py +0 -0
  55. torchaudio/lib/_torchaudio.pyd +0 -0
  56. torchaudio/lib/libtorchaudio.pyd +0 -0
  57. torchaudio/models/__init__.py +85 -0
  58. torchaudio/models/_hdemucs.py +1008 -0
  59. torchaudio/models/conformer.py +293 -0
  60. torchaudio/models/conv_tasnet.py +330 -0
  61. torchaudio/models/decoder/__init__.py +46 -0
  62. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  63. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  64. torchaudio/models/deepspeech.py +84 -0
  65. torchaudio/models/emformer.py +884 -0
  66. torchaudio/models/rnnt.py +816 -0
  67. torchaudio/models/rnnt_decoder.py +339 -0
  68. torchaudio/models/squim/__init__.py +11 -0
  69. torchaudio/models/squim/objective.py +326 -0
  70. torchaudio/models/squim/subjective.py +150 -0
  71. torchaudio/models/tacotron2.py +1046 -0
  72. torchaudio/models/wav2letter.py +72 -0
  73. torchaudio/models/wav2vec2/__init__.py +45 -0
  74. torchaudio/models/wav2vec2/components.py +1167 -0
  75. torchaudio/models/wav2vec2/model.py +1579 -0
  76. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  77. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  78. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  79. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  80. torchaudio/models/wavernn.py +409 -0
  81. torchaudio/pipelines/__init__.py +102 -0
  82. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  83. torchaudio/pipelines/_squim_pipeline.py +156 -0
  84. torchaudio/pipelines/_tts/__init__.py +16 -0
  85. torchaudio/pipelines/_tts/impl.py +385 -0
  86. torchaudio/pipelines/_tts/interface.py +255 -0
  87. torchaudio/pipelines/_tts/utils.py +228 -0
  88. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  89. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  90. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  91. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  92. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  93. torchaudio/prototype/__init__.py +0 -0
  94. torchaudio/prototype/datasets/__init__.py +4 -0
  95. torchaudio/prototype/datasets/musan.py +67 -0
  96. torchaudio/prototype/functional/__init__.py +26 -0
  97. torchaudio/prototype/functional/_dsp.py +433 -0
  98. torchaudio/prototype/functional/_rir.py +379 -0
  99. torchaudio/prototype/functional/functional.py +190 -0
  100. torchaudio/prototype/models/__init__.py +36 -0
  101. torchaudio/prototype/models/_conformer_wav2vec2.py +794 -0
  102. torchaudio/prototype/models/_emformer_hubert.py +333 -0
  103. torchaudio/prototype/models/conv_emformer.py +525 -0
  104. torchaudio/prototype/models/hifi_gan.py +336 -0
  105. torchaudio/prototype/models/rnnt.py +711 -0
  106. torchaudio/prototype/models/rnnt_decoder.py +399 -0
  107. torchaudio/prototype/pipelines/__init__.py +12 -0
  108. torchaudio/prototype/pipelines/_vggish/__init__.py +3 -0
  109. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +233 -0
  110. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +82 -0
  111. torchaudio/prototype/pipelines/hifigan_pipeline.py +228 -0
  112. torchaudio/prototype/pipelines/rnnt_pipeline.py +58 -0
  113. torchaudio/prototype/transforms/__init__.py +9 -0
  114. torchaudio/prototype/transforms/_transforms.py +456 -0
  115. torchaudio/sox_effects/__init__.py +10 -0
  116. torchaudio/sox_effects/sox_effects.py +272 -0
  117. torchaudio/transforms/__init__.py +75 -0
  118. torchaudio/transforms/_multi_channel.py +467 -0
  119. torchaudio/transforms/_transforms.py +2137 -0
  120. torchaudio/utils/__init__.py +11 -0
  121. torchaudio/utils/download.py +89 -0
  122. torchaudio/utils/ffmpeg_utils.py +11 -0
  123. torchaudio/utils/sox_utils.py +99 -0
  124. torchaudio/version.py +2 -0
  125. torchaudio-2.6.0.dist-info/LICENSE +25 -0
  126. torchaudio-2.6.0.dist-info/METADATA +124 -0
  127. torchaudio-2.6.0.dist-info/RECORD +144 -0
  128. torchaudio-2.6.0.dist-info/WHEEL +5 -0
  129. torchaudio-2.6.0.dist-info/top_level.txt +2 -0
  130. torio/__init__.py +8 -0
  131. torio/_extension/__init__.py +13 -0
  132. torio/_extension/utils.py +147 -0
  133. torio/io/__init__.py +9 -0
  134. torio/io/_streaming_media_decoder.py +978 -0
  135. torio/io/_streaming_media_encoder.py +502 -0
  136. torio/lib/__init__.py +0 -0
  137. torio/lib/_torio_ffmpeg4.pyd +0 -0
  138. torio/lib/_torio_ffmpeg5.pyd +0 -0
  139. torio/lib/_torio_ffmpeg6.pyd +0 -0
  140. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  141. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  142. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  143. torio/utils/__init__.py +4 -0
  144. torio/utils/ffmpeg_utils.py +247 -0
@@ -0,0 +1,2137 @@
+ # -*- coding: utf-8 -*-
+
+ import math
+ import warnings
+ from typing import Callable, Optional, Sequence, Tuple, Union
+
+ import torch
+ from torch import Tensor
+ from torch.nn.modules.lazy import LazyModuleMixin
+ from torch.nn.parameter import UninitializedParameter
+
+ from torchaudio import functional as F
+ from torchaudio.functional.functional import (
+     _apply_sinc_resample_kernel,
+     _check_convolve_mode,
+     _fix_waveform_shape,
+     _get_sinc_resample_kernel,
+     _stretch_waveform,
+ )
+
+ __all__ = []
+
+
+ class Spectrogram(torch.nn.Module):
+     r"""Create a spectrogram from an audio signal.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     Args:
+         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+         win_length (int or None, optional): Window size. (Default: ``n_fft``)
+         hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+         pad (int, optional): Two sided padding of signal. (Default: ``0``)
+         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+         power (float or None, optional): Exponent for the magnitude spectrogram,
+             (must be > 0) e.g., 1 for magnitude, 2 for power, etc.
+             If None, then the complex spectrum is returned instead. (Default: ``2``)
+         normalized (bool or str, optional): Whether to normalize by magnitude after stft. If input is str, choices are
+             ``"window"`` and ``"frame_length"``, if a specific normalization type is desirable. ``True`` maps to
+             ``"window"``. (Default: ``False``)
+         wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+         center (bool, optional): whether to pad :attr:`waveform` on both sides so
+             that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+             (Default: ``True``)
+         pad_mode (string, optional): controls the padding method used when
+             :attr:`center` is ``True``. (Default: ``"reflect"``)
+         onesided (bool, optional): controls whether to return half of results to
+             avoid redundancy (Default: ``True``)
+         return_complex (bool, optional):
+             Deprecated and not used.
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = torchaudio.transforms.Spectrogram(n_fft=800)
+         >>> spectrogram = transform(waveform)
+
+     """
+     __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"]
+
+     def __init__(
+         self,
+         n_fft: int = 400,
+         win_length: Optional[int] = None,
+         hop_length: Optional[int] = None,
+         pad: int = 0,
+         window_fn: Callable[..., Tensor] = torch.hann_window,
+         power: Optional[float] = 2.0,
+         normalized: Union[bool, str] = False,
+         wkwargs: Optional[dict] = None,
+         center: bool = True,
+         pad_mode: str = "reflect",
+         onesided: bool = True,
+         return_complex: Optional[bool] = None,
+     ) -> None:
+         super(Spectrogram, self).__init__()
+         torch._C._log_api_usage_once("torchaudio.transforms.Spectrogram")
+         self.n_fft = n_fft
+         # number of FFT bins. the returned STFT result will have n_fft // 2 + 1
+         # number of frequencies due to onesided=True in torch.stft
+         self.win_length = win_length if win_length is not None else n_fft
+         self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+         window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+         self.register_buffer("window", window)
+         self.pad = pad
+         self.power = power
+         self.normalized = normalized
+         self.center = center
+         self.pad_mode = pad_mode
+         self.onesided = onesided
+         if return_complex is not None:
+             warnings.warn(
+                 "`return_complex` argument is now deprecated and is not effective. "
+                 "`torchaudio.transforms.Spectrogram(power=None)` always returns a tensor with "
+                 "complex dtype. Please remove the argument in the function call."
+             )
+
+     def forward(self, waveform: Tensor) -> Tensor:
+         r"""
+         Args:
+             waveform (Tensor): Tensor of audio of dimension (..., time).
+
+         Returns:
+             Tensor: Dimension (..., freq, time), where freq is
+             ``n_fft // 2 + 1`` where ``n_fft`` is the number of
+             Fourier bins, and time is the number of window hops (n_frame).
+         """
+         return F.spectrogram(
+             waveform,
+             self.pad,
+             self.window,
+             self.n_fft,
+             self.hop_length,
+             self.win_length,
+             self.power,
+             self.normalized,
+             self.center,
+             self.pad_mode,
+             self.onesided,
+         )
+
+
+ class InverseSpectrogram(torch.nn.Module):
+     r"""Create an inverse spectrogram to recover an audio signal from a spectrogram.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     Args:
+         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+         win_length (int or None, optional): Window size. (Default: ``n_fft``)
+         hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+         pad (int, optional): Two sided padding of signal. (Default: ``0``)
+         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+         normalized (bool or str, optional): Whether the stft output was normalized by magnitude. If input is str,
+             choices are ``"window"`` and ``"frame_length"``, dependent on normalization mode. ``True`` maps to
+             ``"window"``. (Default: ``False``)
+         wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+         center (bool, optional): whether the signal in spectrogram was padded on both sides so
+             that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+             (Default: ``True``)
+         pad_mode (string, optional): controls the padding method used when
+             :attr:`center` is ``True``. (Default: ``"reflect"``)
+         onesided (bool, optional): controls whether the spectrogram was computed in one-sided form to
+             avoid redundancy (Default: ``True``)
+
+     Example
+         >>> batch, freq, time = 2, 257, 100
+         >>> length = 25344
+         >>> spectrogram = torch.randn(batch, freq, time, dtype=torch.cdouble)
+         >>> transform = transforms.InverseSpectrogram(n_fft=512)
+         >>> waveform = transform(spectrogram, length)
+     """
+     __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"]
+
+     def __init__(
+         self,
+         n_fft: int = 400,
+         win_length: Optional[int] = None,
+         hop_length: Optional[int] = None,
+         pad: int = 0,
+         window_fn: Callable[..., Tensor] = torch.hann_window,
+         normalized: Union[bool, str] = False,
+         wkwargs: Optional[dict] = None,
+         center: bool = True,
+         pad_mode: str = "reflect",
+         onesided: bool = True,
+     ) -> None:
+         super(InverseSpectrogram, self).__init__()
+         self.n_fft = n_fft
+         # number of FFT bins. the returned STFT result will have n_fft // 2 + 1
+         # number of frequencies due to onesided=True in torch.stft
+         self.win_length = win_length if win_length is not None else n_fft
+         self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+         window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+         self.register_buffer("window", window)
+         self.pad = pad
+         self.normalized = normalized
+         self.center = center
+         self.pad_mode = pad_mode
+         self.onesided = onesided
+
+     def forward(self, spectrogram: Tensor, length: Optional[int] = None) -> Tensor:
+         r"""
+         Args:
+             spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time).
+             length (int or None, optional): The output length of the waveform.
+
+         Returns:
+             Tensor: Dimension (..., time), least squares estimation of the original signal.
+         """
+         return F.inverse_spectrogram(
+             spectrogram,
+             length,
+             self.pad,
+             self.window,
+             self.n_fft,
+             self.hop_length,
+             self.win_length,
+             self.normalized,
+             self.center,
+             self.pad_mode,
+             self.onesided,
+         )
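Together these two transforms form a round trip: ``Spectrogram`` with ``power=None`` keeps the complex phase, and ``InverseSpectrogram`` recovers the waveform. A minimal sketch, assuming only a synthetic mono signal (not part of the packaged source):

    import torch
    from torchaudio import transforms

    waveform = torch.randn(1, 16000)  # stand-in for a 1 s, 16 kHz mono recording

    spec = transforms.Spectrogram(n_fft=512, power=None)  # power=None -> complex STFT
    inv = transforms.InverseSpectrogram(n_fft=512)

    spectrogram = spec(waveform)  # (1, 257, frames), complex dtype
    reconstructed = inv(spectrogram, length=waveform.size(-1))
    print(torch.allclose(waveform, reconstructed, atol=1e-5))  # near-perfect inversion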
+
+
+ class GriffinLim(torch.nn.Module):
+     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     Implementation ported from
+     *librosa* :cite:`brian_mcfee-proc-scipy-2015`, *A fast Griffin-Lim algorithm* :cite:`6701851`
+     and *Signal estimation from modified short-time Fourier transform* :cite:`1172092`.
+
+     Args:
+         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+         n_iter (int, optional): Number of iterations for the phase recovery process. (Default: ``32``)
+         win_length (int or None, optional): Window size. (Default: ``n_fft``)
+         hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+         power (float, optional): Exponent for the magnitude spectrogram,
+             (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
+         wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+         momentum (float, optional): The momentum parameter for fast Griffin-Lim.
+             Setting this to 0 recovers the original Griffin-Lim method.
+             Values near 1 can lead to faster convergence, but above 1 may not converge. (Default: ``0.99``)
+         length (int, optional): Array length of the expected output. (Default: ``None``)
+         rand_init (bool, optional): Initializes phase randomly if True and to zero otherwise. (Default: ``True``)
+
+     Example
+         >>> batch, freq, time = 2, 257, 100
+         >>> spectrogram = torch.randn(batch, freq, time)
+         >>> transform = transforms.GriffinLim(n_fft=512)
+         >>> waveform = transform(spectrogram)
+     """
+     __constants__ = ["n_fft", "n_iter", "win_length", "hop_length", "power", "length", "momentum", "rand_init"]
+
+     def __init__(
+         self,
+         n_fft: int = 400,
+         n_iter: int = 32,
+         win_length: Optional[int] = None,
+         hop_length: Optional[int] = None,
+         window_fn: Callable[..., Tensor] = torch.hann_window,
+         power: float = 2.0,
+         wkwargs: Optional[dict] = None,
+         momentum: float = 0.99,
+         length: Optional[int] = None,
+         rand_init: bool = True,
+     ) -> None:
+         super(GriffinLim, self).__init__()
+
+         if not (0 <= momentum < 1):
+             raise ValueError("momentum must be in the range [0, 1). Found: {}".format(momentum))
+
+         self.n_fft = n_fft
+         self.n_iter = n_iter
+         self.win_length = win_length if win_length is not None else n_fft
+         self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+         window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+         self.register_buffer("window", window)
+         self.length = length
+         self.power = power
+         self.momentum = momentum
+         self.rand_init = rand_init
+
+     def forward(self, specgram: Tensor) -> Tensor:
+         r"""
+         Args:
+             specgram (Tensor):
+                 A magnitude-only STFT spectrogram of dimension (..., freq, frames)
+                 where freq is ``n_fft // 2 + 1``.
+
+         Returns:
+             Tensor: waveform of (..., time), where time equals the ``length`` parameter if given.
+         """
+         return F.griffinlim(
+             specgram,
+             self.window,
+             self.n_fft,
+             self.hop_length,
+             self.win_length,
+             self.power,
+             self.n_iter,
+             self.momentum,
+             self.length,
+             self.rand_init,
+         )
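When the phase has been discarded (``power`` set), ``GriffinLim`` re-estimates it iteratively; ``n_fft`` and ``power`` must match the forward transform. A sketch under the same synthetic-input assumption:

    import torch
    from torchaudio import transforms

    waveform = torch.randn(1, 16000)

    spec = transforms.Spectrogram(n_fft=512, power=2.0)  # magnitude-squared, phase dropped
    griffin_lim = transforms.GriffinLim(n_fft=512, power=2.0, n_iter=32)

    reconstructed = griffin_lim(spec(waveform))  # phase recovered iteratively
    print(reconstructed.shape)  # roughly (1, 16000), up to hop-length rounding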
+
+
+ class AmplitudeToDB(torch.nn.Module):
+     r"""Turn a tensor from the power/amplitude scale to the decibel scale.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     This output depends on the maximum value in the input tensor, and so
+     may return different values for an audio clip split into snippets vs. a
+     full clip.
+
+     Args:
+         stype (str, optional): scale of input tensor (``"power"`` or ``"magnitude"``). The
+             power being the elementwise square of the magnitude. (Default: ``"power"``)
+         top_db (float or None, optional): minimum negative cut-off in decibels. A reasonable
+             number is 80. (Default: ``None``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = transforms.AmplitudeToDB(stype="magnitude", top_db=80)
+         >>> waveform_db = transform(waveform)
+     """
+     __constants__ = ["multiplier", "amin", "ref_value", "db_multiplier"]
+
+     def __init__(self, stype: str = "power", top_db: Optional[float] = None) -> None:
+         super(AmplitudeToDB, self).__init__()
+         self.stype = stype
+         if top_db is not None and top_db < 0:
+             raise ValueError("top_db must be a positive value")
+         self.top_db = top_db
+         self.multiplier = 10.0 if stype == "power" else 20.0
+         self.amin = 1e-10
+         self.ref_value = 1.0
+         self.db_multiplier = math.log10(max(self.amin, self.ref_value))
+
+     def forward(self, x: Tensor) -> Tensor:
+         r"""Numerically stable implementation from Librosa.
+
+         https://librosa.org/doc/latest/generated/librosa.amplitude_to_db.html
+
+         Args:
+             x (Tensor): Input tensor before being converted to decibel scale.
+
+         Returns:
+             Tensor: Output tensor in decibel scale.
+         """
+         return F.amplitude_to_DB(x, self.multiplier, self.amin, self.db_multiplier, self.top_db)
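A small sketch of the ``top_db`` cut-off described above: the output floor is clamped at ``max - top_db``, which is why the result depends on the maximum value of the input (synthetic power spectrogram, not part of the packaged source):

    import torch
    from torchaudio import transforms

    power_spec = torch.rand(1, 201, 100)  # stand-in for a power spectrogram
    to_db = transforms.AmplitudeToDB(stype="power", top_db=80.0)

    db = to_db(power_spec)
    print(db.max() - db.min() <= 80.0)  # True: floor clamped at max - top_db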
+
+
+ class MelScale(torch.nn.Module):
+     r"""Turn a normal STFT into a mel frequency STFT with triangular filter banks.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     Args:
+         n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
+         sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+         f_min (float, optional): Minimum frequency. (Default: ``0.``)
+         f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
+         n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
+         norm (str or None, optional): If ``"slaney"``, divide the triangular mel weights by the width of the mel band
+             (area normalization). (Default: ``None``)
+         mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
+         >>> spectrogram = spectrogram_transform(waveform)
+         >>> melscale_transform = transforms.MelScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
+         >>> melscale_spectrogram = melscale_transform(spectrogram)
+
+     See also:
+         :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
+         generate the filter banks.
+     """
+     __constants__ = ["n_mels", "sample_rate", "f_min", "f_max"]
+
+     def __init__(
+         self,
+         n_mels: int = 128,
+         sample_rate: int = 16000,
+         f_min: float = 0.0,
+         f_max: Optional[float] = None,
+         n_stft: int = 201,
+         norm: Optional[str] = None,
+         mel_scale: str = "htk",
+     ) -> None:
+         super(MelScale, self).__init__()
+         self.n_mels = n_mels
+         self.sample_rate = sample_rate
+         self.f_max = f_max if f_max is not None else float(sample_rate // 2)
+         self.f_min = f_min
+         self.norm = norm
+         self.mel_scale = mel_scale
+
+         if f_min > self.f_max:
+             raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
+
+         fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, self.norm, self.mel_scale)
+         self.register_buffer("fb", fb)
+
+     def forward(self, specgram: Tensor) -> Tensor:
+         r"""
+         Args:
+             specgram (Tensor): A spectrogram STFT of dimension (..., freq, time).
+
+         Returns:
+             Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
+         """
+
+         # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time)
+         mel_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
+
+         return mel_specgram
+
+
+ class InverseMelScale(torch.nn.Module):
+     r"""Estimate an STFT in normal frequency domain from the mel frequency domain.
+
+     .. devices:: CPU CUDA
+
+     It minimizes the euclidean norm between the input mel-spectrogram and the product between
+     the estimated spectrogram and the filter banks using `torch.linalg.lstsq`.
+
+     Args:
+         n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
+         n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
+         sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+         f_min (float, optional): Minimum frequency. (Default: ``0.``)
+         f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
+         norm (str or None, optional): If ``"slaney"``, divide the triangular mel weights by the width of the mel band
+             (area normalization). (Default: ``None``)
+         mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+         driver (str, optional): Name of the LAPACK/MAGMA method to be used for `torch.linalg.lstsq`.
+             For CPU inputs the valid values are ``"gels"``, ``"gelsy"``, ``"gelsd"``, ``"gelss"``.
+             For CUDA input, the only valid driver is ``"gels"``, which assumes that A is full-rank.
+             (Default: ``"gels"``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> mel_spectrogram_transform = transforms.MelSpectrogram(sample_rate, n_fft=1024)
+         >>> mel_spectrogram = mel_spectrogram_transform(waveform)
+         >>> inverse_melscale_transform = transforms.InverseMelScale(n_stft=1024 // 2 + 1)
+         >>> spectrogram = inverse_melscale_transform(mel_spectrogram)
+     """
+     __constants__ = [
+         "n_stft",
+         "n_mels",
+         "sample_rate",
+         "f_min",
+         "f_max",
+     ]
+
+     def __init__(
+         self,
+         n_stft: int,
+         n_mels: int = 128,
+         sample_rate: int = 16000,
+         f_min: float = 0.0,
+         f_max: Optional[float] = None,
+         norm: Optional[str] = None,
+         mel_scale: str = "htk",
+         driver: str = "gels",
+     ) -> None:
+         super(InverseMelScale, self).__init__()
+         self.n_mels = n_mels
+         self.sample_rate = sample_rate
+         self.f_max = f_max or float(sample_rate // 2)
+         self.f_min = f_min
+         self.driver = driver
+
+         if f_min > self.f_max:
+             raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
+
+         if driver not in ["gels", "gelsy", "gelsd", "gelss"]:
+             raise ValueError(f'driver must be one of ["gels", "gelsy", "gelsd", "gelss"]. Found {driver}.')
+
+         fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, norm, mel_scale)
+         self.register_buffer("fb", fb)
+
+     def forward(self, melspec: Tensor) -> Tensor:
+         r"""
+         Args:
+             melspec (Tensor): A Mel frequency spectrogram of dimension (..., ``n_mels``, time)
+
+         Returns:
+             Tensor: Linear scale spectrogram of size (..., freq, time)
+         """
+         # pack batch
+         shape = melspec.size()
+         melspec = melspec.view(-1, shape[-2], shape[-1])
+
+         n_mels, time = shape[-2], shape[-1]
+         freq, _ = self.fb.size()  # (freq, n_mels)
+         if self.n_mels != n_mels:
+             raise ValueError("Expected an input with {} mel bins. Found: {}".format(self.n_mels, n_mels))
+
+         specgram = torch.relu(torch.linalg.lstsq(self.fb.transpose(-1, -2)[None], melspec, driver=self.driver).solution)
+
+         # unpack batch
+         specgram = specgram.view(shape[:-2] + (freq, time))
+         return specgram
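A sketch pairing ``MelScale`` with ``InverseMelScale``: the least-squares solve approximately undoes the mel projection, provided the two transforms agree on ``n_stft`` and ``n_mels`` (synthetic input, not part of the packaged source):

    import torch
    from torchaudio import transforms

    specgram = torch.rand(1, 201, 100)  # stand-in linear-frequency spectrogram
    to_mel = transforms.MelScale(n_mels=64, sample_rate=16000, n_stft=201)
    from_mel = transforms.InverseMelScale(n_stft=201, n_mels=64, sample_rate=16000)

    mel = to_mel(specgram)  # (1, 64, 100)
    estimate = from_mel(mel)  # (1, 201, 100), non-negative lstsq estimate
    print(estimate.shape)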
+
+
+ class MelSpectrogram(torch.nn.Module):
+     r"""Create MelSpectrogram for a raw audio signal.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     This is a composition of :py:class:`torchaudio.transforms.Spectrogram`
+     and :py:class:`torchaudio.transforms.MelScale`.
+
+     Sources
+         * https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe
+         * https://timsainb.github.io/spectrograms-mfccs-and-inversion-in-python.html
+         * http://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
+
+     Args:
+         sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+         win_length (int or None, optional): Window size. (Default: ``n_fft``)
+         hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+         f_min (float, optional): Minimum frequency. (Default: ``0.``)
+         f_max (float or None, optional): Maximum frequency. (Default: ``None``)
+         pad (int, optional): Two sided padding of signal. (Default: ``0``)
+         n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
+         window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+             that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+         power (float, optional): Exponent for the magnitude spectrogram,
+             (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
+         normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
+         wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
+         center (bool, optional): whether to pad :attr:`waveform` on both sides so
+             that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+             (Default: ``True``)
+         pad_mode (string, optional): controls the padding method used when
+             :attr:`center` is ``True``. (Default: ``"reflect"``)
+         onesided: Deprecated and unused.
+         norm (str or None, optional): If ``"slaney"``, divide the triangular mel weights by the width of the mel band
+             (area normalization). (Default: ``None``)
+         mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = transforms.MelSpectrogram(sample_rate)
+         >>> mel_specgram = transform(waveform)  # (channel, n_mels, time)
+
+     See also:
+         :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
+         generate the filter banks.
+     """
+     __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_mels", "f_min"]
+
+     def __init__(
+         self,
+         sample_rate: int = 16000,
+         n_fft: int = 400,
+         win_length: Optional[int] = None,
+         hop_length: Optional[int] = None,
+         f_min: float = 0.0,
+         f_max: Optional[float] = None,
+         pad: int = 0,
+         n_mels: int = 128,
+         window_fn: Callable[..., Tensor] = torch.hann_window,
+         power: float = 2.0,
+         normalized: bool = False,
+         wkwargs: Optional[dict] = None,
+         center: bool = True,
+         pad_mode: str = "reflect",
+         onesided: Optional[bool] = None,
+         norm: Optional[str] = None,
+         mel_scale: str = "htk",
+     ) -> None:
+         super(MelSpectrogram, self).__init__()
+         torch._C._log_api_usage_once("torchaudio.transforms.MelSpectrogram")
+
+         if onesided is not None:
+             warnings.warn(
+                 "Argument 'onesided' has been deprecated and has no influence on the behavior of this module."
+             )
+
+         self.sample_rate = sample_rate
+         self.n_fft = n_fft
+         self.win_length = win_length if win_length is not None else n_fft
+         self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+         self.pad = pad
+         self.power = power
+         self.normalized = normalized
+         self.n_mels = n_mels  # number of mel frequency bins
+         self.f_max = f_max
+         self.f_min = f_min
+         self.spectrogram = Spectrogram(
+             n_fft=self.n_fft,
+             win_length=self.win_length,
+             hop_length=self.hop_length,
+             pad=self.pad,
+             window_fn=window_fn,
+             power=self.power,
+             normalized=self.normalized,
+             wkwargs=wkwargs,
+             center=center,
+             pad_mode=pad_mode,
+             onesided=True,
+         )
+         self.mel_scale = MelScale(
+             self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, norm, mel_scale
+         )
+
+     def forward(self, waveform: Tensor) -> Tensor:
+         r"""
+         Args:
+             waveform (Tensor): Tensor of audio of dimension (..., time).
+
+         Returns:
+             Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
+         """
+         specgram = self.spectrogram(waveform)
+         mel_specgram = self.mel_scale(specgram)
+         return mel_specgram
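Since ``MelSpectrogram`` is exactly the composition described above, a sketch checking it against an explicit ``Spectrogram`` followed by ``MelScale``, with matching parameters (synthetic input, not part of the packaged source):

    import torch
    from torchaudio import transforms

    waveform = torch.randn(1, 16000)

    fused = transforms.MelSpectrogram(sample_rate=16000, n_fft=1024, n_mels=80)
    spec = transforms.Spectrogram(n_fft=1024, power=2.0)
    to_mel = transforms.MelScale(n_mels=80, sample_rate=16000, n_stft=1024 // 2 + 1)

    print(torch.allclose(fused(waveform), to_mel(spec(waveform))))  # True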
+
+
+ class MFCC(torch.nn.Module):
+     r"""Create the Mel-frequency cepstrum coefficients from an audio signal.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     By default, this calculates the MFCC on the DB-scaled Mel spectrogram.
+     This is not the textbook implementation, but is implemented here to
+     give consistency with librosa.
+
+     This output depends on the maximum value in the input spectrogram, and so
+     may return different values for an audio clip split into snippets vs. a
+     full clip.
+
+     Args:
+         sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+         n_mfcc (int, optional): Number of mfc coefficients to retain. (Default: ``40``)
+         dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``)
+         norm (str, optional): norm to use. (Default: ``"ortho"``)
+         log_mels (bool, optional): whether to use log-mel spectrograms instead of db-scaled. (Default: ``False``)
+         melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = transforms.MFCC(
+         >>>     sample_rate=sample_rate,
+         >>>     n_mfcc=13,
+         >>>     melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23, "center": False},
+         >>> )
+         >>> mfcc = transform(waveform)
+
+     See also:
+         :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
+         generate the filter banks.
+     """
+     __constants__ = ["sample_rate", "n_mfcc", "dct_type", "top_db", "log_mels"]
+
+     def __init__(
+         self,
+         sample_rate: int = 16000,
+         n_mfcc: int = 40,
+         dct_type: int = 2,
+         norm: str = "ortho",
+         log_mels: bool = False,
+         melkwargs: Optional[dict] = None,
+     ) -> None:
+         super(MFCC, self).__init__()
+         supported_dct_types = [2]
+         if dct_type not in supported_dct_types:
+             raise ValueError("DCT type not supported: {}".format(dct_type))
+         self.sample_rate = sample_rate
+         self.n_mfcc = n_mfcc
+         self.dct_type = dct_type
+         self.norm = norm
+         self.top_db = 80.0
+         self.amplitude_to_DB = AmplitudeToDB("power", self.top_db)
+
+         melkwargs = melkwargs or {}
+         self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate, **melkwargs)
+
+         if self.n_mfcc > self.MelSpectrogram.n_mels:
+             raise ValueError("Cannot select more MFCC coefficients than # mel bins")
+         dct_mat = F.create_dct(self.n_mfcc, self.MelSpectrogram.n_mels, self.norm)
+         self.register_buffer("dct_mat", dct_mat)
+         self.log_mels = log_mels
+
+     def forward(self, waveform: Tensor) -> Tensor:
+         r"""
+         Args:
+             waveform (Tensor): Tensor of audio of dimension (..., time).
+
+         Returns:
+             Tensor: specgram_mel_db of size (..., ``n_mfcc``, time).
+         """
+         mel_specgram = self.MelSpectrogram(waveform)
+         if self.log_mels:
+             log_offset = 1e-6
+             mel_specgram = torch.log(mel_specgram + log_offset)
+         else:
+             mel_specgram = self.amplitude_to_DB(mel_specgram)
+
+         # (..., time, n_mels) dot (n_mels, n_mfcc) -> (..., n_mfcc, time)
+         mfcc = torch.matmul(mel_specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2)
+         return mfcc
+
+
+ class LFCC(torch.nn.Module):
+     r"""Create the linear-frequency cepstrum coefficients from an audio signal.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     By default, this calculates the LFCC on the DB-scaled linear filtered spectrogram.
+     This is not the textbook implementation, but is implemented here to
+     give consistency with librosa.
+
+     This output depends on the maximum value in the input spectrogram, and so
+     may return different values for an audio clip split into snippets vs. a
+     full clip.
+
+     Args:
+         sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+         n_filter (int, optional): Number of linear filters to apply. (Default: ``128``)
+         n_lfcc (int, optional): Number of lfc coefficients to retain. (Default: ``40``)
+         f_min (float, optional): Minimum frequency. (Default: ``0.``)
+         f_max (float or None, optional): Maximum frequency. (Default: ``None``)
+         dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``)
+         norm (str, optional): norm to use. (Default: ``"ortho"``)
+         log_lf (bool, optional): whether to use log-lf spectrograms instead of db-scaled. (Default: ``False``)
+         speckwargs (dict or None, optional): arguments for Spectrogram. (Default: ``None``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = transforms.LFCC(
+         >>>     sample_rate=sample_rate,
+         >>>     n_lfcc=13,
+         >>>     speckwargs={"n_fft": 400, "hop_length": 160, "center": False},
+         >>> )
+         >>> lfcc = transform(waveform)
+
+     See also:
+         :py:func:`torchaudio.functional.linear_fbanks` - The function used to
+         generate the filter banks.
+     """
+     __constants__ = ["sample_rate", "n_filter", "n_lfcc", "dct_type", "top_db", "log_lf"]
+
+     def __init__(
+         self,
+         sample_rate: int = 16000,
+         n_filter: int = 128,
+         f_min: float = 0.0,
+         f_max: Optional[float] = None,
+         n_lfcc: int = 40,
+         dct_type: int = 2,
+         norm: str = "ortho",
+         log_lf: bool = False,
+         speckwargs: Optional[dict] = None,
+     ) -> None:
+         super(LFCC, self).__init__()
+         supported_dct_types = [2]
+         if dct_type not in supported_dct_types:
+             raise ValueError("DCT type not supported: {}".format(dct_type))
+         self.sample_rate = sample_rate
+         self.f_min = f_min
+         self.f_max = f_max if f_max is not None else float(sample_rate // 2)
+         self.n_filter = n_filter
+         self.n_lfcc = n_lfcc
+         self.dct_type = dct_type
+         self.norm = norm
+         self.top_db = 80.0
+         self.amplitude_to_DB = AmplitudeToDB("power", self.top_db)
+
+         speckwargs = speckwargs or {}
+         self.Spectrogram = Spectrogram(**speckwargs)
+
+         if self.n_lfcc > self.Spectrogram.n_fft:
+             raise ValueError("Cannot select more LFCC coefficients than # fft bins")
+
+         filter_mat = F.linear_fbanks(
+             n_freqs=self.Spectrogram.n_fft // 2 + 1,
+             f_min=self.f_min,
+             f_max=self.f_max,
+             n_filter=self.n_filter,
+             sample_rate=self.sample_rate,
+         )
+         self.register_buffer("filter_mat", filter_mat)
+
+         dct_mat = F.create_dct(self.n_lfcc, self.n_filter, self.norm)
+         self.register_buffer("dct_mat", dct_mat)
+         self.log_lf = log_lf
+
+     def forward(self, waveform: Tensor) -> Tensor:
+         r"""
+         Args:
+             waveform (Tensor): Tensor of audio of dimension (..., time).
+
+         Returns:
+             Tensor: Linear Frequency Cepstral Coefficients of size (..., ``n_lfcc``, time).
+         """
+         specgram = self.Spectrogram(waveform)
+
+         # (..., time, freq) dot (freq, n_filter) -> (..., n_filter, time)
+         specgram = torch.matmul(specgram.transpose(-1, -2), self.filter_mat).transpose(-1, -2)
+
+         if self.log_lf:
+             log_offset = 1e-6
+             specgram = torch.log(specgram + log_offset)
+         else:
+             specgram = self.amplitude_to_DB(specgram)
+
+         # (..., time, n_filter) dot (n_filter, n_lfcc) -> (..., n_lfcc, time)
+         lfcc = torch.matmul(specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2)
+         return lfcc
+
+
+ class MuLawEncoding(torch.nn.Module):
+     r"""Encode signal based on mu-law companding.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: TorchScript
+
+     For more info see the
+     `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
+
+     This algorithm assumes the signal has been scaled to between -1 and 1 and
+     returns a signal encoded with values from 0 to ``quantization_channels - 1``.
+
+     Args:
+         quantization_channels (int, optional): Number of channels. (Default: ``256``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = torchaudio.transforms.MuLawEncoding(quantization_channels=512)
+         >>> mulawtrans = transform(waveform)
+
+     """
+     __constants__ = ["quantization_channels"]
+
+     def __init__(self, quantization_channels: int = 256) -> None:
+         super(MuLawEncoding, self).__init__()
+         self.quantization_channels = quantization_channels
+
+     def forward(self, x: Tensor) -> Tensor:
+         r"""
+         Args:
+             x (Tensor): A signal to be encoded.
+
+         Returns:
+             Tensor: An encoded signal.
+         """
+         return F.mu_law_encoding(x, self.quantization_channels)
+
+
+ class MuLawDecoding(torch.nn.Module):
+     r"""Decode mu-law encoded signal.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: TorchScript
+
+     For more info see the
+     `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_
+
+     This expects an input with values between 0 and ``quantization_channels - 1``
+     and returns a signal scaled between -1 and 1.
+
+     Args:
+         quantization_channels (int, optional): Number of channels. (Default: ``256``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = torchaudio.transforms.MuLawDecoding(quantization_channels=512)
+         >>> mulawtrans = transform(waveform)
+     """
+     __constants__ = ["quantization_channels"]
+
+     def __init__(self, quantization_channels: int = 256) -> None:
+         super(MuLawDecoding, self).__init__()
+         self.quantization_channels = quantization_channels
+
+     def forward(self, x_mu: Tensor) -> Tensor:
+         r"""
+         Args:
+             x_mu (Tensor): A mu-law encoded signal which needs to be decoded.
+
+         Returns:
+             Tensor: The decoded signal.
+         """
+         return F.mu_law_decoding(x_mu, self.quantization_channels)
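A round-trip sketch for the two mu-law transforms: decoding inverts encoding up to quantization error, provided the input is already scaled to [-1, 1] (synthetic input, not part of the packaged source):

    import torch
    from torchaudio import transforms

    waveform = torch.rand(1, 16000) * 2 - 1  # stand-in signal scaled to [-1, 1]

    encode = transforms.MuLawEncoding(quantization_channels=256)
    decode = transforms.MuLawDecoding(quantization_channels=256)

    encoded = encode(waveform)  # integer codes in [0, 255]
    decoded = decode(encoded)
    print((waveform - decoded).abs().max())  # small quantization error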
+
+
+ class Resample(torch.nn.Module):
+     r"""Resample a signal from one frequency to another. A resampling method can be given.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     Note:
+         If resampling on waveforms of higher precision than float32, there may be a small loss of precision
+         because the kernel is cached once as float32. If high precision resampling is important for your application,
+         the functional form will retain higher precision, but run slower because it does not cache the kernel.
+         Alternatively, you could rewrite a transform that caches a higher precision kernel.
+
+     Args:
+         orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``)
+         new_freq (int, optional): The desired frequency. (Default: ``16000``)
+         resampling_method (str, optional): The resampling method to use.
+             Options: [``sinc_interp_hann``, ``sinc_interp_kaiser``] (Default: ``"sinc_interp_hann"``)
+         lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper
+             but less efficient. (Default: ``6``)
+         rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist.
+             Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``)
+         beta (float or None, optional): The shape parameter used for kaiser window.
+         dtype (torch.dtype, optional):
+             Determines the precision that the resampling kernel is pre-computed and cached in. If not provided,
+             the kernel is computed with ``torch.float64`` then cached as ``torch.float32``.
+             If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and
+             cached as ``torch.float64``. If you use resample with lower precision, then instead of providing this
+             argument, please use ``Resample.to(dtype)``, so that the kernel generation is still
+             carried out on ``torch.float64``.
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = transforms.Resample(sample_rate, sample_rate // 10)
+         >>> waveform = transform(waveform)
+     """
+
+     def __init__(
+         self,
+         orig_freq: int = 16000,
+         new_freq: int = 16000,
+         resampling_method: str = "sinc_interp_hann",
+         lowpass_filter_width: int = 6,
+         rolloff: float = 0.99,
+         beta: Optional[float] = None,
+         *,
+         dtype: Optional[torch.dtype] = None,
+     ) -> None:
+         super().__init__()
+
+         self.orig_freq = orig_freq
+         self.new_freq = new_freq
+         self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq))
+         self.resampling_method = resampling_method
+         self.lowpass_filter_width = lowpass_filter_width
+         self.rolloff = rolloff
+         self.beta = beta
+
+         if self.orig_freq != self.new_freq:
+             kernel, self.width = _get_sinc_resample_kernel(
+                 self.orig_freq,
+                 self.new_freq,
+                 self.gcd,
+                 self.lowpass_filter_width,
+                 self.rolloff,
+                 self.resampling_method,
+                 beta,
+                 dtype=dtype,
+             )
+             self.register_buffer("kernel", kernel)
+
+     def forward(self, waveform: Tensor) -> Tensor:
+         r"""
+         Args:
+             waveform (Tensor): Tensor of audio of dimension (..., time).
+
+         Returns:
+             Tensor: Output signal of dimension (..., time).
+         """
+         if self.orig_freq == self.new_freq:
+             return waveform
+         return _apply_sinc_resample_kernel(waveform, self.orig_freq, self.new_freq, self.gcd, self.kernel, self.width)
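A sketch of the precision trade-off from the note above: passing ``dtype=torch.float64`` keeps the cached kernel in double precision, matching a double-precision input (synthetic input, not part of the packaged source):

    import torch
    from torchaudio import transforms

    waveform = torch.randn(1, 48000, dtype=torch.float64)  # high-precision input

    # Default behavior: the kernel is computed in float64 but cached as float32.
    # For float64 waveforms, keep the kernel in float64 instead:
    resample = transforms.Resample(orig_freq=48000, new_freq=16000, dtype=torch.float64)
    print(resample(waveform).shape)  # (1, 16000)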
+
+
+ class ComputeDeltas(torch.nn.Module):
+     r"""Compute delta coefficients of a tensor, usually a spectrogram.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     See `torchaudio.functional.compute_deltas` for more details.
+
+     Args:
+         win_length (int, optional): The window length used for computing delta. (Default: ``5``)
+         mode (str, optional): Mode parameter passed to padding. (Default: ``"replicate"``)
+     """
+     __constants__ = ["win_length"]
+
+     def __init__(self, win_length: int = 5, mode: str = "replicate") -> None:
+         super(ComputeDeltas, self).__init__()
+         self.win_length = win_length
+         self.mode = mode
+
+     def forward(self, specgram: Tensor) -> Tensor:
+         r"""
+         Args:
+             specgram (Tensor): Tensor of audio of dimension (..., freq, time).
+
+         Returns:
+             Tensor: Tensor of deltas of dimension (..., freq, time).
+         """
+         return F.compute_deltas(specgram, win_length=self.win_length, mode=self.mode)
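A sketch of a common front-end pattern built on ``ComputeDeltas``: stacking features with their first- and second-order deltas (synthetic input, not part of the packaged source):

    import torch
    from torchaudio import transforms

    specgram = torch.rand(1, 40, 100)  # stand-in for e.g. a mel spectrogram

    compute_deltas = transforms.ComputeDeltas(win_length=5)
    delta = compute_deltas(specgram)
    delta2 = compute_deltas(delta)  # delta of delta

    features = torch.cat([specgram, delta, delta2], dim=-2)
    print(features.shape)  # (1, 120, 100)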
+
+
+ class TimeStretch(torch.nn.Module):
+     r"""Stretch an STFT in time without modifying pitch, for a given rate.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     Proposed in *SpecAugment* :cite:`specaugment`.
+
+     Args:
+         hop_length (int or None, optional): Length of hop between STFT windows.
+             (Default: ``n_fft // 2``, where ``n_fft == (n_freq - 1) * 2``)
+         n_freq (int, optional): number of filter banks from stft. (Default: ``201``)
+         fixed_rate (float or None, optional): rate to speed up or slow down by.
+             If None is provided, rate must be passed to the forward method. (Default: ``None``)
+
+     .. note::
+
+         The expected input is a raw, complex-valued spectrogram.
+
+     Example
+         >>> spectrogram = torchaudio.transforms.Spectrogram(power=None)
+         >>> stretch = torchaudio.transforms.TimeStretch()
+         >>>
+         >>> original = spectrogram(waveform)
+         >>> stretched_1_2 = stretch(original, 1.2)
+         >>> stretched_0_9 = stretch(original, 0.9)
+
+     .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch.png
+         :width: 600
+         :alt: The visualization of stretched spectrograms.
+     """
+     __constants__ = ["fixed_rate"]
+
+     def __init__(self, hop_length: Optional[int] = None, n_freq: int = 201, fixed_rate: Optional[float] = None) -> None:
+         super(TimeStretch, self).__init__()
+
+         self.fixed_rate = fixed_rate
+
+         n_fft = (n_freq - 1) * 2
+         hop_length = hop_length if hop_length is not None else n_fft // 2
+         self.register_buffer("phase_advance", torch.linspace(0, math.pi * hop_length, n_freq)[..., None])
+
+     def forward(self, complex_specgrams: Tensor, overriding_rate: Optional[float] = None) -> Tensor:
+         r"""
+         Args:
+             complex_specgrams (Tensor):
+                 A tensor of dimension `(..., freq, num_frame)` with complex dtype.
+             overriding_rate (float or None, optional): speed-up to apply to this batch.
+                 If no rate is passed, use ``self.fixed_rate``. (Default: ``None``)
+
+         Returns:
+             Tensor:
+                 Stretched spectrogram. The resulting tensor has the same complex dtype
+                 as the input spectrogram, and the number of frames is changed to ``ceil(num_frame / rate)``.
+         """
+         if not torch.is_complex(complex_specgrams):
+             warnings.warn(
+                 "The input to TimeStretch must be of complex dtype. "
+                 "Providing a non-complex tensor produces invalid results.",
+                 stacklevel=4,
+             )
+
+         if overriding_rate is None:
+             if self.fixed_rate is None:
+                 raise ValueError("If no fixed_rate is specified, must pass a valid rate to the forward method.")
+             rate = self.fixed_rate
+         else:
+             rate = overriding_rate
+         return F.phase_vocoder(complex_specgrams, rate, self.phase_advance)
+
+
+ class Fade(torch.nn.Module):
+     r"""Add a fade in and/or fade out to a waveform.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     Args:
+         fade_in_len (int, optional): Length of fade-in (time frames). (Default: ``0``)
+         fade_out_len (int, optional): Length of fade-out (time frames). (Default: ``0``)
+         fade_shape (str, optional): Shape of fade. Must be one of: ``"quarter_sine"``,
+             ``"half_sine"``, ``"linear"``, ``"logarithmic"``, ``"exponential"``.
+             (Default: ``"linear"``)
+
+     Example
+         >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+         >>> transform = transforms.Fade(fade_in_len=sample_rate, fade_out_len=2 * sample_rate, fade_shape="linear")
+         >>> faded_waveform = transform(waveform)
+     """
+
+     def __init__(self, fade_in_len: int = 0, fade_out_len: int = 0, fade_shape: str = "linear") -> None:
+         super(Fade, self).__init__()
+         self.fade_in_len = fade_in_len
+         self.fade_out_len = fade_out_len
+         self.fade_shape = fade_shape
+
+     def forward(self, waveform: Tensor) -> Tensor:
+         r"""
+         Args:
+             waveform (Tensor): Tensor of audio of dimension `(..., time)`.
+
+         Returns:
+             Tensor: Tensor of audio of dimension `(..., time)`.
+         """
+         waveform_length = waveform.size()[-1]
+         device = waveform.device
+         return self._fade_in(waveform_length, device) * self._fade_out(waveform_length, device) * waveform
+
+     def _fade_in(self, waveform_length: int, device: torch.device) -> Tensor:
+         fade = torch.linspace(0, 1, self.fade_in_len, device=device)
+         ones = torch.ones(waveform_length - self.fade_in_len, device=device)
+
+         if self.fade_shape == "linear":
+             fade = fade
+
+         if self.fade_shape == "exponential":
+             fade = torch.pow(2, (fade - 1)) * fade
+
+         if self.fade_shape == "logarithmic":
+             fade = torch.log10(0.1 + fade) + 1
+
+         if self.fade_shape == "quarter_sine":
+             fade = torch.sin(fade * math.pi / 2)
+
+         if self.fade_shape == "half_sine":
+             fade = torch.sin(fade * math.pi - math.pi / 2) / 2 + 0.5
+
+         return torch.cat((fade, ones)).clamp_(0, 1)
+
+     def _fade_out(self, waveform_length: int, device: torch.device) -> Tensor:
+         fade = torch.linspace(0, 1, self.fade_out_len, device=device)
+         ones = torch.ones(waveform_length - self.fade_out_len, device=device)
+
+         if self.fade_shape == "linear":
+             fade = -fade + 1
+
+         if self.fade_shape == "exponential":
+             fade = torch.pow(2, -fade) * (1 - fade)
+
+         if self.fade_shape == "logarithmic":
+             fade = torch.log10(1.1 - fade) + 1
+
+         if self.fade_shape == "quarter_sine":
+             fade = torch.sin(fade * math.pi / 2 + math.pi / 2)
+
+         if self.fade_shape == "half_sine":
+             fade = torch.sin(fade * math.pi + math.pi / 2) / 2 + 0.5
+
+         return torch.cat((ones, fade)).clamp_(0, 1)
+
+
+ class _AxisMasking(torch.nn.Module):
+     r"""Apply masking to a spectrogram.
+
+     Args:
+         mask_param (int): Maximum possible length of the mask.
+         axis (int): What dimension the mask is applied on (assuming the tensor is 3D).
+             For frequency masking, axis = 1.
+             For time masking, axis = 2.
+         iid_masks (bool): Applies iid masks to each of the examples in the batch dimension.
+             This option is applicable only when the dimension of the input tensor is >= 3.
+         p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0)
+     """
+     __constants__ = ["mask_param", "axis", "iid_masks", "p"]
+
+     def __init__(self, mask_param: int, axis: int, iid_masks: bool, p: float = 1.0) -> None:
+         super(_AxisMasking, self).__init__()
+         self.mask_param = mask_param
+         self.axis = axis
+         self.iid_masks = iid_masks
+         self.p = p
+
+     def forward(self, specgram: Tensor, mask_value: float = 0.0) -> Tensor:
+         r"""
+         Args:
+             specgram (Tensor): Tensor of dimension `(..., freq, time)`.
+             mask_value (float): Value to assign to the masked columns.
+
+         Returns:
+             Tensor: Masked spectrogram of dimensions `(..., freq, time)`.
+         """
+         # If the iid_masks flag is set and specgram has a batch dimension,
+         # self.axis + specgram.dim() - 3 gives the time/frequency dimension (one of the
+         # last two dimensions) for an input tensor whose dimension is not 3.
+         if self.iid_masks:
+             return F.mask_along_axis_iid(
+                 specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p
+             )
+         else:
+             return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis + specgram.dim() - 3, p=self.p)
+
+
+ class FrequencyMasking(_AxisMasking):
+     r"""Apply masking to a spectrogram in the frequency domain.
+
+     .. devices:: CPU CUDA
+
+     .. properties:: Autograd TorchScript
+
+     Proposed in *SpecAugment* :cite:`specaugment`.
+
+     Args:
+         freq_mask_param (int): maximum possible length of the mask.
+             Indices uniformly sampled from [0, freq_mask_param).
+         iid_masks (bool, optional): whether to apply different masks to each
+             example/channel in the batch. (Default: ``False``)
+             This option is applicable only when the input tensor is at least 3D.
+
+     Example
+         >>> spectrogram = torchaudio.transforms.Spectrogram()
+         >>> masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
+         >>>
+         >>> original = spectrogram(waveform)
+         >>> masked = masking(original)
+
+     .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking1.png
+         :alt: The original spectrogram
+
+     .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking2.png
+         :alt: The spectrogram masked along frequency axis
+     """
+
+     def __init__(self, freq_mask_param: int, iid_masks: bool = False) -> None:
+         super(FrequencyMasking, self).__init__(freq_mask_param, 1, iid_masks)
1239
+
1240
+
1241
+class TimeMasking(_AxisMasking):
+    r"""Apply masking to a spectrogram in the time domain.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Proposed in *SpecAugment* :cite:`specaugment`.
+
+    Args:
+        time_mask_param (int): maximum possible length of the mask.
+            Indices uniformly sampled from [0, time_mask_param).
+        iid_masks (bool, optional): whether to apply different masks to each
+            example/channel in the batch. (Default: ``False``)
+            This option is applicable only when the input tensor is at least 3D.
+        p (float, optional): maximum proportion of time steps that can be masked.
+            Must be within range [0.0, 1.0]. (Default: 1.0)
+
+    Example
+        >>> spectrogram = torchaudio.transforms.Spectrogram()
+        >>> masking = torchaudio.transforms.TimeMasking(time_mask_param=80)
+        >>>
+        >>> original = spectrogram(waveform)
+        >>> masked = masking(original)
+
+    .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking1.png
+        :alt: The original spectrogram
+
+    .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking2.png
+        :alt: The spectrogram masked along time axis
+    """
+
+    def __init__(self, time_mask_param: int, iid_masks: bool = False, p: float = 1.0) -> None:
+        if not 0.0 <= p <= 1.0:
+            raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).")
+        super(TimeMasking, self).__init__(time_mask_param, 2, iid_masks, p=p)
+
+
+class SpecAugment(torch.nn.Module):
+    r"""Apply time and frequency masking to a spectrogram.
+
+    Args:
+        n_time_masks (int): Number of time masks. If its value is zero, no time masking will be applied.
+        time_mask_param (int): Maximum possible length of the time mask.
+        n_freq_masks (int): Number of frequency masks. If its value is zero, no frequency masking will be applied.
+        freq_mask_param (int): Maximum possible length of the frequency mask.
+        iid_masks (bool, optional): Applies iid masks to each of the examples in the batch dimension.
+            This option is applicable only when the input tensor is 4D. (Default: ``True``)
+        p (float, optional): maximum proportion of time steps that can be masked.
+            Must be within range [0.0, 1.0]. (Default: 1.0)
+        zero_masking (bool, optional): If ``True``, use 0 as the mask value,
+            else use mean of the input tensor. (Default: ``False``)
+    """
+    __constants__ = [
+        "n_time_masks",
+        "time_mask_param",
+        "n_freq_masks",
+        "freq_mask_param",
+        "iid_masks",
+        "p",
+        "zero_masking",
+    ]
+
+    def __init__(
+        self,
+        n_time_masks: int,
+        time_mask_param: int,
+        n_freq_masks: int,
+        freq_mask_param: int,
+        iid_masks: bool = True,
+        p: float = 1.0,
+        zero_masking: bool = False,
+    ) -> None:
+        super(SpecAugment, self).__init__()
+        self.n_time_masks = n_time_masks
+        self.time_mask_param = time_mask_param
+        self.n_freq_masks = n_freq_masks
+        self.freq_mask_param = freq_mask_param
+        self.iid_masks = iid_masks
+        self.p = p
+        self.zero_masking = zero_masking
+
+    def forward(self, specgram: Tensor) -> Tensor:
+        r"""
+        Args:
+            specgram (Tensor): Tensor of shape `(..., freq, time)`.
+
+        Returns:
+            Tensor: Masked spectrogram of shape `(..., freq, time)`.
+        """
+        if self.zero_masking:
+            mask_value = 0.0
+        else:
+            mask_value = specgram.mean()
+        time_dim = specgram.dim() - 1
+        freq_dim = time_dim - 1
+
+        if specgram.dim() > 2 and self.iid_masks is True:
+            for _ in range(self.n_time_masks):
+                specgram = F.mask_along_axis_iid(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
+            for _ in range(self.n_freq_masks):
+                specgram = F.mask_along_axis_iid(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
+        else:
+            for _ in range(self.n_time_masks):
+                specgram = F.mask_along_axis(specgram, self.time_mask_param, mask_value, time_dim, p=self.p)
+            for _ in range(self.n_freq_masks):
+                specgram = F.mask_along_axis(specgram, self.freq_mask_param, mask_value, freq_dim, p=self.p)
+
+        return specgram
+
+
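Unlike `FrequencyMasking` and `TimeMasking`, the docstring above carries no usage example, so here is a minimal sketch; the input shape is illustrative:

    import torch
    import torchaudio.transforms as T

    specgram = torch.randn(4, 1, 201, 400)  # (batch, channel, freq, time)
    augment = T.SpecAugment(
        n_time_masks=2,
        time_mask_param=30,
        n_freq_masks=2,
        freq_mask_param=15,
    )
    augmented = augment(specgram)  # same shape, with masked time/frequency stripes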
+class Loudness(torch.nn.Module):
+    r"""Measure audio loudness according to the ITU-R BS.1770-4 recommendation.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: TorchScript
+
+    Args:
+        sample_rate (int): Sample rate of audio signal.
+
+    Example
+        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+        >>> transform = transforms.Loudness(sample_rate)
+        >>> loudness = transform(waveform)
+
+    Reference:
+        - https://www.itu.int/rec/R-REC-BS.1770-4-201510-I/en
+    """
+    __constants__ = ["sample_rate"]
+
+    def __init__(self, sample_rate: int):
+        super(Loudness, self).__init__()
+        self.sample_rate = sample_rate
+
+    def forward(self, waveform: Tensor):
+        r"""
+        Args:
+            waveform (Tensor): audio waveform of dimension `(..., channels, time)`
+
+        Returns:
+            Tensor: loudness estimates (LKFS)
+        """
+        return F.loudness(waveform, self.sample_rate)
+
+
+class Vol(torch.nn.Module):
+    r"""Adjust volume of waveform.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Args:
+        gain (float): Interpreted according to the given gain_type:
+            If ``gain_type`` = ``amplitude``, ``gain`` is a positive amplitude ratio.
+            If ``gain_type`` = ``power``, ``gain`` is a power (voltage squared).
+            If ``gain_type`` = ``db``, ``gain`` is in decibels.
+        gain_type (str, optional): Type of gain. One of: ``amplitude``, ``power``, ``db`` (Default: ``amplitude``)
+
+    Example
+        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+        >>> transform = transforms.Vol(gain=0.5, gain_type="amplitude")
+        >>> quieter_waveform = transform(waveform)
+    """
+
+    def __init__(self, gain: float, gain_type: str = "amplitude"):
+        super(Vol, self).__init__()
+        self.gain = gain
+        self.gain_type = gain_type
+
+        if gain_type in ["amplitude", "power"] and gain < 0:
+            raise ValueError("If gain_type = amplitude or power, gain must be non-negative.")
+
+    def forward(self, waveform: Tensor) -> Tensor:
+        r"""
+        Args:
+            waveform (Tensor): Tensor of audio of dimension `(..., time)`.
+
+        Returns:
+            Tensor: Tensor of audio of dimension `(..., time)`.
+        """
+        if self.gain_type == "amplitude":
+            waveform = waveform * self.gain
+
+        if self.gain_type == "db":
+            waveform = F.gain(waveform, self.gain)
+
+        if self.gain_type == "power":
+            waveform = F.gain(waveform, 10 * math.log10(self.gain))
+
+        return torch.clamp(waveform, -1, 1)
+
+
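The three gain types are interchangeable up to a conversion: `forward` turns a power gain `g` into `10 * log10(g)` dB before calling `F.gain`, and since power is amplitude squared, the matching amplitude ratio is `sqrt(g)`. A quick sketch of the arithmetic, with illustrative values:

    import math

    power_gain = 0.25                       # quarter of the original power
    db_gain = 10 * math.log10(power_gain)   # ~ -6.02 dB, what Vol passes to F.gain
    amplitude_gain = math.sqrt(power_gain)  # 0.5, the equivalent amplitude ratio
    print(db_gain, amplitude_gain)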
+class SlidingWindowCmn(torch.nn.Module):
+    r"""
+    Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Args:
+        cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600)
+        min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start).
+            Only applicable if center == false, ignored if center == true (int, default = 100)
+        center (bool, optional): If true, use a window centered on the current frame
+            (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)
+        norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false)
+
+    Example
+        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+        >>> transform = transforms.SlidingWindowCmn(cmn_window=1000)
+        >>> cmn_waveform = transform(waveform)
+    """
+
+    def __init__(
+        self, cmn_window: int = 600, min_cmn_window: int = 100, center: bool = False, norm_vars: bool = False
+    ) -> None:
+        super().__init__()
+        self.cmn_window = cmn_window
+        self.min_cmn_window = min_cmn_window
+        self.center = center
+        self.norm_vars = norm_vars
+
+    def forward(self, specgram: Tensor) -> Tensor:
+        r"""
+        Args:
+            specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)`.
+
+        Returns:
+            Tensor: Tensor of spectrogram of dimension `(..., time, freq)`.
+        """
+        cmn_specgram = F.sliding_window_cmn(specgram, self.cmn_window, self.min_cmn_window, self.center, self.norm_vars)
+        return cmn_specgram
+
+
+class Vad(torch.nn.Module):
+    r"""Voice Activity Detector. Similar to SoX implementation.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: TorchScript
+
+    Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
+    The algorithm currently uses a simple cepstral power measurement to detect voice,
+    so may be fooled by other things, especially music.
+
+    The effect can trim only from the front of the audio,
+    so in order to trim from the back, the reverse effect must also be used.
+
+    Args:
+        sample_rate (int): Sample rate of audio signal.
+        trigger_level (float, optional): The measurement level used to trigger activity detection.
+            This may need to be changed depending on the noise level, signal level,
+            and other characteristics of the input audio. (Default: 7.0)
+        trigger_time (float, optional): The time constant (in seconds)
+            used to help ignore short bursts of sound. (Default: 0.25)
+        search_time (float, optional): The amount of audio (in seconds)
+            to search for quieter/shorter bursts of audio to include prior
+            to the detected trigger point. (Default: 1.0)
+        allowed_gap (float, optional): The allowed gap (in seconds) between
+            quieter/shorter bursts of audio to include prior
+            to the detected trigger point. (Default: 0.25)
+        pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve
+            before the trigger point and any found quieter/shorter bursts. (Default: 0.0)
+        boot_time (float, optional): The algorithm (internally) uses adaptive noise
+            estimation/reduction in order to detect the start of the wanted audio.
+            This option sets the time for the initial noise estimate. (Default: 0.35)
+        noise_up_time (float, optional): Time constant used by the adaptive noise estimator
+            for when the noise level is increasing. (Default: 0.1)
+        noise_down_time (float, optional): Time constant used by the adaptive noise estimator
+            for when the noise level is decreasing. (Default: 0.01)
+        noise_reduction_amount (float, optional): Amount of noise reduction to use in
+            the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35)
+        measure_freq (float, optional): Frequency of the algorithm’s
+            processing/measurements. (Default: 20.0)
+        measure_duration (float or None, optional): Measurement duration.
+            (Default: Twice the measurement period; i.e. with overlap.)
+        measure_smooth_time (float, optional): Time constant used to smooth
+            spectral measurements. (Default: 0.4)
+        hp_filter_freq (float, optional): "Brick-wall" frequency of high-pass filter applied
+            at the input to the detector algorithm. (Default: 50.0)
+        lp_filter_freq (float, optional): "Brick-wall" frequency of low-pass filter applied
+            at the input to the detector algorithm. (Default: 6000.0)
+        hp_lifter_freq (float, optional): "Brick-wall" frequency of high-pass lifter used
+            in the detector algorithm. (Default: 150.0)
+        lp_lifter_freq (float, optional): "Brick-wall" frequency of low-pass lifter used
+            in the detector algorithm. (Default: 2000.0)
+
+    Example
+        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+        >>> waveform_reversed, sample_rate = apply_effects_tensor(waveform, sample_rate, [["reverse"]])
+        >>> transform = transforms.Vad(sample_rate=sample_rate, trigger_level=7.5)
+        >>> waveform_reversed_front_trim = transform(waveform_reversed)
+        >>> waveform_end_trim, sample_rate = apply_effects_tensor(
+        >>>     waveform_reversed_front_trim, sample_rate, [["reverse"]]
+        >>> )
+
+    Reference:
+        - http://sox.sourceforge.net/sox.html
+    """
+
+    def __init__(
+        self,
+        sample_rate: int,
+        trigger_level: float = 7.0,
+        trigger_time: float = 0.25,
+        search_time: float = 1.0,
+        allowed_gap: float = 0.25,
+        pre_trigger_time: float = 0.0,
+        boot_time: float = 0.35,
+        noise_up_time: float = 0.1,
+        noise_down_time: float = 0.01,
+        noise_reduction_amount: float = 1.35,
+        measure_freq: float = 20.0,
+        measure_duration: Optional[float] = None,
+        measure_smooth_time: float = 0.4,
+        hp_filter_freq: float = 50.0,
+        lp_filter_freq: float = 6000.0,
+        hp_lifter_freq: float = 150.0,
+        lp_lifter_freq: float = 2000.0,
+    ) -> None:
+        super().__init__()
+
+        self.sample_rate = sample_rate
+        self.trigger_level = trigger_level
+        self.trigger_time = trigger_time
+        self.search_time = search_time
+        self.allowed_gap = allowed_gap
+        self.pre_trigger_time = pre_trigger_time
+        self.boot_time = boot_time
+        self.noise_up_time = noise_up_time
+        self.noise_down_time = noise_down_time
+        self.noise_reduction_amount = noise_reduction_amount
+        self.measure_freq = measure_freq
+        self.measure_duration = measure_duration
+        self.measure_smooth_time = measure_smooth_time
+        self.hp_filter_freq = hp_filter_freq
+        self.lp_filter_freq = lp_filter_freq
+        self.hp_lifter_freq = hp_lifter_freq
+        self.lp_lifter_freq = lp_lifter_freq
+
+    def forward(self, waveform: Tensor) -> Tensor:
+        r"""
+        Args:
+            waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)`.
+                Tensor of shape `(channels, time)` is treated as a multi-channel recording
+                of the same event and the resulting output will be trimmed to the earliest
+                voice activity in any channel.
+        """
+        return F.vad(
+            waveform=waveform,
+            sample_rate=self.sample_rate,
+            trigger_level=self.trigger_level,
+            trigger_time=self.trigger_time,
+            search_time=self.search_time,
+            allowed_gap=self.allowed_gap,
+            pre_trigger_time=self.pre_trigger_time,
+            boot_time=self.boot_time,
+            noise_up_time=self.noise_up_time,
+            noise_down_time=self.noise_down_time,
+            noise_reduction_amount=self.noise_reduction_amount,
+            measure_freq=self.measure_freq,
+            measure_duration=self.measure_duration,
+            measure_smooth_time=self.measure_smooth_time,
+            hp_filter_freq=self.hp_filter_freq,
+            lp_filter_freq=self.lp_filter_freq,
+            hp_lifter_freq=self.hp_lifter_freq,
+            lp_lifter_freq=self.lp_lifter_freq,
+        )
+
+
+class SpectralCentroid(torch.nn.Module):
+    r"""Compute the spectral centroid for each channel along the time axis.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    The spectral centroid is defined as the weighted average of the
+    frequency values, weighted by their magnitude.
+
+    Args:
+        sample_rate (int): Sample rate of audio signal.
+        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
+        win_length (int or None, optional): Window size. (Default: ``n_fft``)
+        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
+        pad (int, optional): Two sided padding of signal. (Default: ``0``)
+        window_fn (Callable[..., Tensor], optional): A function to create a window tensor
+            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
+        wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+
+    Example
+        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+        >>> transform = transforms.SpectralCentroid(sample_rate)
+        >>> spectral_centroid = transform(waveform)  # (channel, time)
+    """
+    __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad"]
+
+    def __init__(
+        self,
+        sample_rate: int,
+        n_fft: int = 400,
+        win_length: Optional[int] = None,
+        hop_length: Optional[int] = None,
+        pad: int = 0,
+        window_fn: Callable[..., Tensor] = torch.hann_window,
+        wkwargs: Optional[dict] = None,
+    ) -> None:
+        super(SpectralCentroid, self).__init__()
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.win_length = win_length if win_length is not None else n_fft
+        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+        window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+        self.register_buffer("window", window)
+        self.pad = pad
+
+    def forward(self, waveform: Tensor) -> Tensor:
+        r"""
+        Args:
+            waveform (Tensor): Tensor of audio of dimension `(..., time)`.
+
+        Returns:
+            Tensor: Spectral Centroid of size `(..., time)`.
+        """
+
+        return F.spectral_centroid(
+            waveform, self.sample_rate, self.pad, self.window, self.n_fft, self.hop_length, self.win_length
+        )
+
+
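Per frame, the centroid is the magnitude-weighted mean of the bin frequencies, `sum(f * |X(f)|) / sum(|X(f)|)`. A minimal sketch of that weighted average on a single synthetic magnitude frame; shapes and values are illustrative:

    import torch

    sample_rate, n_fft = 16000, 400
    freqs = torch.linspace(0, sample_rate // 2, n_fft // 2 + 1)  # bin center frequencies
    magnitude = torch.rand(n_fft // 2 + 1)                       # one STFT frame

    centroid = (freqs * magnitude).sum() / magnitude.sum()
    print(centroid)  # in Hz: the magnitude-weighted mean frequency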
+class PitchShift(LazyModuleMixin, torch.nn.Module):
+    r"""Shift the pitch of a waveform by ``n_steps`` steps.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: TorchScript
+
+    Args:
+        sample_rate (int): Sample rate of the input waveform.
+        n_steps (int): The (fractional) steps to shift the waveform.
+        bins_per_octave (int, optional): The number of steps per octave (Default: ``12``).
+        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``).
+        win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``).
+        hop_length (int or None, optional): Length of hop between STFT windows. If None, then ``win_length // 4``
+            is used (Default: ``None``).
+        window_fn (Callable[..., Tensor], optional): A function to create a window tensor that is
+            applied/multiplied to each frame/window. (Default: ``torch.hann_window``).
+        wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``).
+
+    Example
+        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+        >>> transform = transforms.PitchShift(sample_rate, 4)
+        >>> waveform_shift = transform(waveform)  # (channel, time)
+    """
+    __constants__ = ["sample_rate", "n_steps", "bins_per_octave", "n_fft", "win_length", "hop_length"]
+
+    kernel: UninitializedParameter
+    width: int
+
+    def __init__(
+        self,
+        sample_rate: int,
+        n_steps: int,
+        bins_per_octave: int = 12,
+        n_fft: int = 512,
+        win_length: Optional[int] = None,
+        hop_length: Optional[int] = None,
+        window_fn: Callable[..., Tensor] = torch.hann_window,
+        wkwargs: Optional[dict] = None,
+    ) -> None:
+        super().__init__()
+        self.n_steps = n_steps
+        self.bins_per_octave = bins_per_octave
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.win_length = win_length if win_length is not None else n_fft
+        self.hop_length = hop_length if hop_length is not None else self.win_length // 4
+        window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
+        self.register_buffer("window", window)
+        rate = 2.0 ** (-float(n_steps) / bins_per_octave)
+        self.orig_freq = int(sample_rate / rate)
+        self.gcd = math.gcd(int(self.orig_freq), int(sample_rate))
+
+        if self.orig_freq != sample_rate:
+            self.width = -1
+            self.kernel = UninitializedParameter(device=None, dtype=None)
+
+    def initialize_parameters(self, input):
+        if self.has_uninitialized_params():
+            if self.orig_freq != self.sample_rate:
+                with torch.no_grad():
+                    kernel, self.width = _get_sinc_resample_kernel(
+                        self.orig_freq,
+                        self.sample_rate,
+                        self.gcd,
+                        dtype=input.dtype,
+                        device=input.device,
+                    )
+                    self.kernel.materialize(kernel.shape)
+                    self.kernel.copy_(kernel)
+
+    def forward(self, waveform: Tensor) -> Tensor:
+        r"""
+        Args:
+            waveform (Tensor): Tensor of audio of dimension `(..., time)`.
+
+        Returns:
+            Tensor: The pitch-shifted audio of shape `(..., time)`.
+        """
+        shape = waveform.size()
+
+        waveform_stretch = _stretch_waveform(
+            waveform,
+            self.n_steps,
+            self.bins_per_octave,
+            self.n_fft,
+            self.win_length,
+            self.hop_length,
+            self.window,
+        )
+
+        if self.orig_freq != self.sample_rate:
+            waveform_shift = _apply_sinc_resample_kernel(
+                waveform_stretch,
+                self.orig_freq,
+                self.sample_rate,
+                self.gcd,
+                self.kernel,
+                self.width,
+            )
+        else:
+            waveform_shift = waveform_stretch
+
+        return _fix_waveform_shape(
+            waveform_shift,
+            shape,
+        )
+
+
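The constructor's arithmetic determines the resampling step: shifting by `n_steps` corresponds to a rate change of `2 ** (-n_steps / bins_per_octave)`, and the time-stretched signal is resampled from the virtual rate `sample_rate / rate` back to `sample_rate`. A sketch of those numbers for a 4-semitone upward shift, with illustrative values:

    import math

    sample_rate, n_steps, bins_per_octave = 16000, 4, 12
    rate = 2.0 ** (-float(n_steps) / bins_per_octave)  # ~0.794 for +4 semitones
    orig_freq = int(sample_rate / rate)                # ~20158, the virtual source rate
    gcd = math.gcd(orig_freq, sample_rate)             # reduces the resampling ratio
    print(rate, orig_freq, gcd)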
+class RNNTLoss(torch.nn.Module):
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    :cite:`graves2012sequence`.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    The RNN Transducer loss extends the CTC loss by defining a distribution over output
+    sequences of all lengths, and by jointly modelling both input-output and output-output
+    dependencies.
+
+    Args:
+        blank (int, optional): blank label (Default: ``-1``)
+        clamp (float, optional): clamp for gradients (Default: ``-1``)
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``"none"`` | ``"mean"`` | ``"sum"``. (Default: ``"mean"``)
+        fused_log_softmax (bool): set to False if calling log_softmax outside of loss (Default: ``True``)
+
+    Example
+        >>> # Hypothetical values
+        >>> logits = torch.tensor([[[[0.1, 0.6, 0.1, 0.1, 0.1],
+        >>>                          [0.1, 0.1, 0.6, 0.1, 0.1],
+        >>>                          [0.1, 0.1, 0.2, 0.8, 0.1]],
+        >>>                         [[0.1, 0.6, 0.1, 0.1, 0.1],
+        >>>                          [0.1, 0.1, 0.2, 0.1, 0.1],
+        >>>                          [0.7, 0.1, 0.2, 0.1, 0.1]]]],
+        >>>                       dtype=torch.float32,
+        >>>                       requires_grad=True)
+        >>> targets = torch.tensor([[1, 2]], dtype=torch.int)
+        >>> logit_lengths = torch.tensor([2], dtype=torch.int)
+        >>> target_lengths = torch.tensor([2], dtype=torch.int)
+        >>> transform = transforms.RNNTLoss(blank=0)
+        >>> loss = transform(logits, targets, logit_lengths, target_lengths)
+        >>> loss.backward()
+    """
+
+    def __init__(
+        self,
+        blank: int = -1,
+        clamp: float = -1.0,
+        reduction: str = "mean",
+        fused_log_softmax: bool = True,
+    ):
+        super().__init__()
+        self.blank = blank
+        self.clamp = clamp
+        self.reduction = reduction
+        self.fused_log_softmax = fused_log_softmax
+
+    def forward(
+        self,
+        logits: Tensor,
+        targets: Tensor,
+        logit_lengths: Tensor,
+        target_lengths: Tensor,
+    ):
+        """
+        Args:
+            logits (Tensor): Tensor of dimension `(batch, max seq length, max target length + 1, class)`
+                containing output from joiner
+            targets (Tensor): Tensor of dimension `(batch, max target length)` containing targets with zero padding
+            logit_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of each sequence from encoder
+            target_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of targets for each sequence
+
+        Returns:
+            Tensor: Loss with the reduction option applied. If ``reduction`` is ``"none"``, then size `(batch)`,
+            otherwise scalar.
+        """
+        return F.rnnt_loss(
+            logits,
+            targets,
+            logit_lengths,
+            target_lengths,
+            self.blank,
+            self.clamp,
+            self.reduction,
+            self.fused_log_softmax,
+        )
+
+
+class Convolve(torch.nn.Module):
+    r"""
+    Convolves inputs along their last dimension using the direct method.
+    Note that, in contrast to :class:`torch.nn.Conv1d`, which actually applies the valid cross-correlation
+    operator, this module applies the true `convolution`_ operator.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Args:
+        mode (str, optional): Must be one of ("full", "valid", "same").
+
+            * "full": Returns the full convolution result, with shape `(..., N + M - 1)`, where
+              `N` and `M` are the trailing dimensions of the two inputs. (Default)
+            * "valid": Returns the segment of the full convolution result corresponding to where
+              the two inputs overlap completely, with shape `(..., max(N, M) - min(N, M) + 1)`.
+            * "same": Returns the center segment of the full convolution result, with shape `(..., N)`.
+
+    .. _convolution:
+        https://en.wikipedia.org/wiki/Convolution
+    """
+
+    def __init__(self, mode: str = "full") -> None:
+        _check_convolve_mode(mode)
+
+        super().__init__()
+        self.mode = mode
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        r"""
+        Args:
+            x (torch.Tensor): First convolution operand, with shape `(..., N)`.
+            y (torch.Tensor): Second convolution operand, with shape `(..., M)`
+                (leading dimensions must be broadcast-able with those of ``x``).
+
+        Returns:
+            torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., L)`, where
+            the leading dimensions match those of ``x`` and `L` is dictated by ``mode``.
+        """
+        return F.convolve(x, y, mode=self.mode)
+
+
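The three modes differ only in which segment of the length-`N + M - 1` full result they return. A minimal shape check; input sizes are illustrative:

    import torch
    import torchaudio.transforms as T

    x = torch.randn(1, 10)  # N = 10
    y = torch.randn(1, 4)   # M = 4
    for mode in ("full", "valid", "same"):
        out = T.Convolve(mode=mode)(x, y)
        print(mode, tuple(out.shape))  # full: 13, valid: 7, same: 10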
+class FFTConvolve(torch.nn.Module):
+    r"""
+    Convolves inputs along their last dimension using FFT. For inputs with large last dimensions, this module
+    is generally much faster than :class:`Convolve`.
+    Note that, in contrast to :class:`torch.nn.Conv1d`, which actually applies the valid cross-correlation
+    operator, this module applies the true `convolution`_ operator.
+    Also note that this module can only output float tensors (int tensor inputs will be cast to float).
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Args:
+        mode (str, optional): Must be one of ("full", "valid", "same").
+
+            * "full": Returns the full convolution result, with shape `(..., N + M - 1)`, where
+              `N` and `M` are the trailing dimensions of the two inputs. (Default)
+            * "valid": Returns the segment of the full convolution result corresponding to where
+              the two inputs overlap completely, with shape `(..., max(N, M) - min(N, M) + 1)`.
+            * "same": Returns the center segment of the full convolution result, with shape `(..., N)`.
+
+    .. _convolution:
+        https://en.wikipedia.org/wiki/Convolution
+    """
+
+    def __init__(self, mode: str = "full") -> None:
+        _check_convolve_mode(mode)
+
+        super().__init__()
+        self.mode = mode
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        r"""
+        Args:
+            x (torch.Tensor): First convolution operand, with shape `(..., N)`.
+            y (torch.Tensor): Second convolution operand, with shape `(..., M)`
+                (leading dimensions must be broadcast-able with those of ``x``).
+
+        Returns:
+            torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., L)`, where
+            the leading dimensions match those of ``x`` and `L` is dictated by ``mode``.
+        """
+        return F.fftconvolve(x, y, mode=self.mode)
+
+
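`FFTConvolve` is an algorithmic drop-in for `Convolve`: the same result up to floating-point error, computed via FFT. A quick equivalence check; sizes are illustrative:

    import torch
    import torchaudio.transforms as T

    x, y = torch.randn(512), torch.randn(64)
    direct = T.Convolve(mode="full")(x, y)
    via_fft = T.FFTConvolve(mode="full")(x, y)
    print(torch.allclose(direct, via_fft, atol=1e-5))  # True, up to numerical error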
+def _source_target_sample_rate(orig_freq: int, speed: float) -> Tuple[int, int]:
+    source_sample_rate = int(speed * orig_freq)
+    target_sample_rate = int(orig_freq)
+    gcd = math.gcd(source_sample_rate, target_sample_rate)
+    return source_sample_rate // gcd, target_sample_rate // gcd
+
+
+class Speed(torch.nn.Module):
+    r"""Adjusts waveform speed.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Args:
+        orig_freq (int): Original frequency of the signals in ``waveform``.
+        factor (float): Factor by which to adjust speed of input. Values greater than 1.0
+            compress ``waveform`` in time, whereas values less than 1.0 stretch ``waveform`` in time.
+    """
+
+    def __init__(self, orig_freq, factor) -> None:
+        super().__init__()
+
+        self.orig_freq = orig_freq
+        self.factor = factor
+
+        self.source_sample_rate, self.target_sample_rate = _source_target_sample_rate(orig_freq, factor)
+        self.resampler = Resample(orig_freq=self.source_sample_rate, new_freq=self.target_sample_rate)
+
+    def forward(self, waveform, lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        r"""
+        Args:
+            waveform (torch.Tensor): Input signals, with shape `(..., time)`.
+            lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform``, with shape `(...)`.
+                If ``None``, all elements in ``waveform`` are treated as valid. (Default: ``None``)
+
+        Returns:
+            (torch.Tensor, torch.Tensor or None):
+                torch.Tensor
+                    Speed-adjusted waveform, with shape `(..., new_time)`.
+                torch.Tensor or None
+                    If ``lengths`` is not ``None``, valid lengths of signals in speed-adjusted waveform,
+                    with shape `(...)`; otherwise, ``None``.
+        """
+
+        if lengths is None:
+            out_lengths = None
+        else:
+            out_lengths = torch.ceil(lengths * self.target_sample_rate / self.source_sample_rate).to(lengths.dtype)
+
+        return self.resampler(waveform), out_lengths
+
+
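Speeding up by `factor` is implemented as resampling from `factor * orig_freq` down to `orig_freq`, with the pair reduced by their gcd before constructing the `Resample` module; valid lengths shrink by the same ratio. A sketch of the arithmetic for a 1.1x speed-up, with illustrative values:

    import math

    orig_freq, factor, num_samples = 16000, 1.1, 16000
    source = int(factor * orig_freq)        # 17600
    gcd = math.gcd(source, orig_freq)       # 1600
    print(source // gcd, orig_freq // gcd)  # 11 -> 10: the reduced resampling ratio
    print(math.ceil(num_samples * 10 / 11)) # 14546, the new valid length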
+class SpeedPerturbation(torch.nn.Module):
+    r"""Applies the speed perturbation augmentation introduced in
+    *Audio augmentation for speech recognition* :cite:`ko15_interspeech`. For a given input,
+    the module samples a speed-up factor from ``factors`` uniformly at random and adjusts
+    the speed of the input by that factor.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Args:
+        orig_freq (int): Original frequency of the signals in ``waveform``.
+        factors (Sequence[float]): Factors by which to adjust speed of input. Values greater than 1.0
+            compress ``waveform`` in time, whereas values less than 1.0 stretch ``waveform`` in time.
+
+    Example
+        >>> speed_perturb = SpeedPerturbation(16000, [0.9, 1.1, 1.0, 1.0, 1.0])
+        >>> # waveform speed will be adjusted by factor 0.9 with 20% probability,
+        >>> # 1.1 with 20% probability, and 1.0 (i.e. kept the same) with 60% probability.
+        >>> speed_perturbed_waveform = speed_perturb(waveform, lengths)
+    """
+
+    def __init__(self, orig_freq: int, factors: Sequence[float]) -> None:
+        super().__init__()
+
+        self.speeders = torch.nn.ModuleList([Speed(orig_freq=orig_freq, factor=factor) for factor in factors])
+
+    def forward(
+        self, waveform: torch.Tensor, lengths: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        r"""
+        Args:
+            waveform (torch.Tensor): Input signals, with shape `(..., time)`.
+            lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform``, with shape `(...)`.
+                If ``None``, all elements in ``waveform`` are treated as valid. (Default: ``None``)
+
+        Returns:
+            (torch.Tensor, torch.Tensor or None):
+                torch.Tensor
+                    Speed-adjusted waveform, with shape `(..., new_time)`.
+                torch.Tensor or None
+                    If ``lengths`` is not ``None``, valid lengths of signals in speed-adjusted waveform,
+                    with shape `(...)`; otherwise, ``None``.
+        """
+
+        idx = int(torch.randint(len(self.speeders), ()))
+        # NOTE: we do this because TorchScript doesn't allow for
+        # indexing ModuleList instances with non-literals.
+        for speeder_idx, speeder in enumerate(self.speeders):
+            if idx == speeder_idx:
+                return speeder(waveform, lengths)
+        raise RuntimeError("Speeder not found; execution should have never reached here.")
+
+
+class AddNoise(torch.nn.Module):
+    r"""Scales and adds noise to waveform per signal-to-noise ratio.
+    See :meth:`torchaudio.functional.add_noise` for more details.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+    """
+
+    def forward(
+        self, waveform: torch.Tensor, noise: torch.Tensor, snr: torch.Tensor, lengths: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            waveform (torch.Tensor): Input waveform, with shape `(..., L)`.
+            noise (torch.Tensor): Noise, with shape `(..., L)` (same shape as ``waveform``).
+            snr (torch.Tensor): Signal-to-noise ratios in dB, with shape `(...,)`.
+            lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform`` and ``noise``,
+                with shape `(...,)` (leading dimensions must match those of ``waveform``). If ``None``, all
+                elements in ``waveform`` and ``noise`` are treated as valid. (Default: ``None``)
+
+        Returns:
+            torch.Tensor: Result of scaling and adding ``noise`` to ``waveform``, with shape `(..., L)`
+            (same shape as ``waveform``).
+        """
+        return F.add_noise(waveform, noise, snr, lengths)
+
+
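`F.add_noise` scales ``noise`` so that the ratio of signal power to added-noise power matches the requested SNR. A rough sanity check of that contract; a sketch with illustrative shapes and SNR:

    import torch
    import torchaudio.functional as F

    waveform = torch.randn(1, 16000)
    noise = torch.randn(1, 16000)
    snr = torch.tensor([10.0])  # desired SNR in dB

    noisy = F.add_noise(waveform, noise, snr)
    added = noisy - waveform
    measured = 10 * torch.log10(waveform.pow(2).sum() / added.pow(2).sum())
    print(measured)  # ~10 dB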
+class Preemphasis(torch.nn.Module):
+    r"""Pre-emphasizes a waveform along its last dimension.
+    See :meth:`torchaudio.functional.preemphasis` for more details.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Args:
+        coeff (float, optional): Pre-emphasis coefficient. Typically between 0.0 and 1.0.
+            (Default: 0.97)
+    """
+
+    def __init__(self, coeff: float = 0.97) -> None:
+        super().__init__()
+        self.coeff = coeff
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        r"""
+        Args:
+            waveform (torch.Tensor): Waveform, with shape `(..., N)`.
+
+        Returns:
+            torch.Tensor: Pre-emphasized waveform, with shape `(..., N)`.
+        """
+        return F.preemphasis(waveform, coeff=self.coeff)
+
+
+class Deemphasis(torch.nn.Module):
+    r"""De-emphasizes a waveform along its last dimension.
+    See :meth:`torchaudio.functional.deemphasis` for more details.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: Autograd TorchScript
+
+    Args:
+        coeff (float, optional): De-emphasis coefficient. Typically between 0.0 and 1.0.
+            (Default: 0.97)
+    """
+
+    def __init__(self, coeff: float = 0.97) -> None:
+        super().__init__()
+        self.coeff = coeff
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        r"""
+        Args:
+            waveform (torch.Tensor): Waveform, with shape `(..., N)`.
+
+        Returns:
+            torch.Tensor: De-emphasized waveform, with shape `(..., N)`.
+        """
+        return F.deemphasis(waveform, coeff=self.coeff)
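`Preemphasis` computes `y[n] = x[n] - coeff * x[n - 1]` and `Deemphasis` applies the inverse IIR filter, so chaining the two approximately recovers the input. A minimal round-trip check; a sketch, with the tolerance reflecting float32 filter round-off:

    import torch
    import torchaudio.transforms as T

    waveform = torch.randn(1, 1000)
    pre = T.Preemphasis(coeff=0.97)
    de = T.Deemphasis(coeff=0.97)
    roundtrip = de(pre(waveform))
    print(torch.allclose(roundtrip, waveform, atol=1e-4))  # True up to round-off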