torchaudio 2.9.1__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. torchaudio/.dylibs/libc++.1.0.dylib +0 -0
  2. torchaudio/__init__.py +204 -0
  3. torchaudio/_extension/__init__.py +61 -0
  4. torchaudio/_extension/utils.py +133 -0
  5. torchaudio/_internal/__init__.py +10 -0
  6. torchaudio/_internal/module_utils.py +171 -0
  7. torchaudio/_torchcodec.py +340 -0
  8. torchaudio/compliance/__init__.py +5 -0
  9. torchaudio/compliance/kaldi.py +813 -0
  10. torchaudio/datasets/__init__.py +47 -0
  11. torchaudio/datasets/cmuarctic.py +157 -0
  12. torchaudio/datasets/cmudict.py +186 -0
  13. torchaudio/datasets/commonvoice.py +86 -0
  14. torchaudio/datasets/dr_vctk.py +121 -0
  15. torchaudio/datasets/fluentcommands.py +108 -0
  16. torchaudio/datasets/gtzan.py +1118 -0
  17. torchaudio/datasets/iemocap.py +147 -0
  18. torchaudio/datasets/librilight_limited.py +111 -0
  19. torchaudio/datasets/librimix.py +133 -0
  20. torchaudio/datasets/librispeech.py +174 -0
  21. torchaudio/datasets/librispeech_biasing.py +189 -0
  22. torchaudio/datasets/libritts.py +168 -0
  23. torchaudio/datasets/ljspeech.py +107 -0
  24. torchaudio/datasets/musdb_hq.py +139 -0
  25. torchaudio/datasets/quesst14.py +136 -0
  26. torchaudio/datasets/snips.py +157 -0
  27. torchaudio/datasets/speechcommands.py +183 -0
  28. torchaudio/datasets/tedlium.py +218 -0
  29. torchaudio/datasets/utils.py +54 -0
  30. torchaudio/datasets/vctk.py +143 -0
  31. torchaudio/datasets/voxceleb1.py +309 -0
  32. torchaudio/datasets/yesno.py +89 -0
  33. torchaudio/functional/__init__.py +130 -0
  34. torchaudio/functional/_alignment.py +128 -0
  35. torchaudio/functional/filtering.py +1685 -0
  36. torchaudio/functional/functional.py +2505 -0
  37. torchaudio/lib/__init__.py +0 -0
  38. torchaudio/lib/_torchaudio.so +0 -0
  39. torchaudio/lib/libtorchaudio.so +0 -0
  40. torchaudio/models/__init__.py +85 -0
  41. torchaudio/models/_hdemucs.py +1008 -0
  42. torchaudio/models/conformer.py +293 -0
  43. torchaudio/models/conv_tasnet.py +330 -0
  44. torchaudio/models/decoder/__init__.py +64 -0
  45. torchaudio/models/decoder/_ctc_decoder.py +568 -0
  46. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  47. torchaudio/models/deepspeech.py +84 -0
  48. torchaudio/models/emformer.py +884 -0
  49. torchaudio/models/rnnt.py +816 -0
  50. torchaudio/models/rnnt_decoder.py +339 -0
  51. torchaudio/models/squim/__init__.py +11 -0
  52. torchaudio/models/squim/objective.py +326 -0
  53. torchaudio/models/squim/subjective.py +150 -0
  54. torchaudio/models/tacotron2.py +1046 -0
  55. torchaudio/models/wav2letter.py +72 -0
  56. torchaudio/models/wav2vec2/__init__.py +45 -0
  57. torchaudio/models/wav2vec2/components.py +1167 -0
  58. torchaudio/models/wav2vec2/model.py +1579 -0
  59. torchaudio/models/wav2vec2/utils/__init__.py +7 -0
  60. torchaudio/models/wav2vec2/utils/import_fairseq.py +213 -0
  61. torchaudio/models/wav2vec2/utils/import_huggingface.py +134 -0
  62. torchaudio/models/wav2vec2/wavlm_attention.py +214 -0
  63. torchaudio/models/wavernn.py +409 -0
  64. torchaudio/pipelines/__init__.py +102 -0
  65. torchaudio/pipelines/_source_separation_pipeline.py +109 -0
  66. torchaudio/pipelines/_squim_pipeline.py +156 -0
  67. torchaudio/pipelines/_tts/__init__.py +16 -0
  68. torchaudio/pipelines/_tts/impl.py +385 -0
  69. torchaudio/pipelines/_tts/interface.py +255 -0
  70. torchaudio/pipelines/_tts/utils.py +230 -0
  71. torchaudio/pipelines/_wav2vec2/__init__.py +0 -0
  72. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  73. torchaudio/pipelines/_wav2vec2/impl.py +1699 -0
  74. torchaudio/pipelines/_wav2vec2/utils.py +346 -0
  75. torchaudio/pipelines/rnnt_pipeline.py +380 -0
  76. torchaudio/transforms/__init__.py +78 -0
  77. torchaudio/transforms/_multi_channel.py +467 -0
  78. torchaudio/transforms/_transforms.py +2138 -0
  79. torchaudio/utils/__init__.py +4 -0
  80. torchaudio/utils/download.py +89 -0
  81. torchaudio/version.py +2 -0
  82. torchaudio-2.9.1.dist-info/METADATA +133 -0
  83. torchaudio-2.9.1.dist-info/RECORD +86 -0
  84. torchaudio-2.9.1.dist-info/WHEEL +5 -0
  85. torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
  86. torchaudio-2.9.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1685 @@
1
+ import math
2
+ import warnings
3
+ from typing import Optional
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+
9
+ from torchaudio._extension import _IS_TORCHAUDIO_EXT_AVAILABLE
10
+
11
+
12
def _dB2Linear(x: float) -> float:
    """Convert a gain in decibels to a linear amplitude ratio.

    Evaluated as ``exp(x * ln(10) / 20)``, i.e. ``10 ** (x / 20)``.

    Args:
        x (float): gain in dB

    Returns:
        float: the corresponding linear amplitude factor
    """
    exponent = x * math.log(10) / 20.0
    return math.exp(exponent)
14
+
15
+
16
def _generate_wave_table(
    wave_type: str,
    data_type: str,
    table_size: int,
    min: float,
    max: float,
    phase: float,
    device: torch.device,
) -> Tensor:
    r"""Build a one-period modulation lookup table (helper for the modulation effects).

    Args:
        wave_type (str): ``"SINE"`` or ``"TRIANGLE"``. Any other value leaves the
            normalised wave at zero, so every entry equals ``min``.
        data_type (str): ``"INT"`` (entries offset by +/-0.5 and cast to ``torch.int32``)
            or ``"FLOAT"`` (entries cast to ``torch.float32``). Any other value
            returns the table as ``torch.float64``.
        table_size (int): number of entries in the table
        min (float): value the normalised wave's 0 maps to
        max (float): value the normalised wave's 1 maps to
        phase (float): starting phase in radians
        device (torch.device): Torch device on which the table is generated

    Returns:
        Tensor: A 1D tensor with ``table_size`` wave table values
    """
    # Express the phase as a whole number of table entries (add 0.5, then truncate).
    shift = int(phase / math.pi / 2 * table_size + 0.5)

    steps = torch.arange(table_size, device=device, dtype=torch.int32)
    idx = (steps + shift) % table_size

    # Normalised wave in [0, 1].
    if wave_type == "SINE":
        wave = (torch.sin(idx.to(torch.float64) / table_size * 2 * math.pi) + 1) / 2
    elif wave_type == "TRIANGLE":
        ramp = idx.to(torch.float64) * 2 / table_size
        # Which quarter of the period each entry falls in (0, 1, 2 or 3).
        quarter = torch.div(4 * idx, table_size, rounding_mode="floor")
        wave = torch.where(
            quarter == 0,
            ramp + 0.5,
            torch.where(quarter == 3, ramp - 1.5, 1.5 - ramp),
        )
    else:
        wave = torch.zeros_like(idx, device=device, dtype=torch.float64)

    # Map [0, 1] onto [min, max].
    wave = wave * (max - min) + min

    if data_type == "INT":
        # Push each value 0.5 away from zero before the integer cast.
        wave = torch.where(wave < 0, wave - 0.5, wave + 0.5).to(torch.int32)
    elif data_type == "FLOAT":
        wave = wave.to(torch.float32)

    return wave
68
+
69
+
70
def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
    r"""Apply a two-pole all-pass filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    # Bring the scalar parameters onto the waveform's dtype and device.
    central_freq = torch.as_tensor(central_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    minus_two_cos = -2 * torch.cos(omega)

    # The numerator (b) is the denominator (a) with its coefficients reversed.
    return biquad(
        waveform,
        1 - alpha,
        minus_two_cos,
        1 + alpha,
        1 + alpha,
        minus_two_cos,
        1 - alpha,
    )
106
+
107
+
108
def band_biquad(
    waveform: Tensor,
    sample_rate: int,
    central_freq: float,
    Q: float = 0.707,
    noise: bool = False,
) -> Tensor:
    r"""Apply a two-pole band filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).
        noise (bool, optional) : If ``True``, uses the alternate mode for un-pitched audio (e.g. percussion).
            If ``False``, uses mode oriented to pitched audio, i.e. voice, singing,
            or instrumental music (Default: ``False``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html
    """
    # Bring the scalar parameters onto the waveform's dtype and device.
    central_freq = torch.as_tensor(central_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)

    omega = 2 * math.pi * central_freq / sample_rate
    bandwidth_hz = central_freq / Q

    # Denominator (feedback) coefficients; a0 is fixed at 1.
    a2 = torch.exp(-2 * math.pi * bandwidth_hz / sample_rate)
    a1 = -4 * a2 / (1 + a2) * torch.cos(omega)

    # Only the x[n] numerator tap is non-zero.
    in_gain = torch.sqrt(1 - a1 * a1 / (4 * a2)) * (1 - a2)
    if noise:
        scale = torch.sqrt(((1 + a2) * (1 + a2) - a1 * a1) * (1 - a2) / (1 + a2)) / in_gain
        in_gain = scale * in_gain

    return biquad(waveform, in_gain, 0.0, 0.0, 1.0, a1, a2)
159
+
160
+
161
def bandpass_biquad(
    waveform: Tensor,
    sample_rate: int,
    central_freq: float,
    Q: float = 0.707,
    const_skirt_gain: bool = False,
) -> Tensor:
    r"""Apply a two-pole band-pass filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
        const_skirt_gain (bool, optional) : If ``True``, uses a constant skirt gain (peak gain = Q).
            If ``False``, uses a constant 0dB peak gain. (Default: ``False``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html
    """
    # Bring the scalar parameters onto the waveform's dtype and device.
    central_freq = torch.as_tensor(central_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)

    omega = 2 * math.pi * central_freq / sample_rate
    half_sin = torch.sin(omega) / 2
    alpha = half_sin / Q

    # The numerator magnitude is sin(w0)/2 for constant skirt gain, alpha otherwise.
    if const_skirt_gain:
        numerator = half_sin
    else:
        numerator = alpha

    return biquad(
        waveform,
        numerator,
        0.0,
        -numerator,
        1 + alpha,
        -2 * torch.cos(omega),
        1 - alpha,
    )
205
+
206
+
207
def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
    r"""Apply a two-pole band-reject filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html
    """
    # Bring the scalar parameters onto the waveform's dtype and device.
    central_freq = torch.as_tensor(central_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    # b1 and a1 share the same value.
    minus_two_cos = -2 * torch.cos(omega)

    return biquad(
        waveform,
        1.0,
        minus_two_cos,
        1.0,
        1 + alpha,
        minus_two_cos,
        1 - alpha,
    )
242
+
243
+
244
def bass_biquad(
    waveform: Tensor,
    sample_rate: int,
    gain: float,
    central_freq: float = 100,
    Q: float = 0.707,
) -> Tensor:
    r"""Apply a bass tone-control effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB.
        central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``100``)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html
    """
    # Bring the scalar parameters onto the waveform's dtype and device.
    central_freq = torch.as_tensor(central_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)
    gain = torch.as_tensor(gain, dtype=waveform.dtype, device=waveform.device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    # Amplitude factor: 10 ** (gain / 40).
    A = torch.exp(gain / 40 * math.log(10))

    cos_omega = torch.cos(omega)
    slope_term = 2 * torch.sqrt(A) * alpha
    minus_term = (A - 1) * cos_omega
    plus_term = (A + 1) * cos_omega

    b0 = A * ((A + 1) - minus_term + slope_term)
    b1 = 2 * A * ((A - 1) - plus_term)
    b2 = A * ((A + 1) - minus_term - slope_term)
    a0 = (A + 1) + minus_term + slope_term
    a1 = -2 * ((A - 1) + plus_term)
    a2 = (A + 1) + minus_term - slope_term

    # Hand over coefficients already normalised by a0.
    return biquad(waveform, b0 / a0, b1 / a0, b2 / a0, a0 / a0, a1 / a0, a2 / a0)
293
+
294
+
295
def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor:
    r"""Run a biquad (second-order) filter over the input tensor via ``lfilter``.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        b0 (float or torch.Tensor): numerator coefficient of current input, x[n]
        b1 (float or torch.Tensor): numerator coefficient of input one time step ago x[n-1]
        b2 (float or torch.Tensor): numerator coefficient of input two time steps ago x[n-2]
        a0 (float or torch.Tensor): denominator coefficient of current output y[n], typically 1
        a1 (float or torch.Tensor): denominator coefficient of output one time step ago y[n-1]
        a2 (float or torch.Tensor): denominator coefficient of output two time steps ago y[n-2]

    Returns:
        Tensor: Waveform with dimension of `(..., time)`

    Reference:
       - https://en.wikipedia.org/wiki/Digital_biquad_filter
    """
    dev = waveform.device
    dt = waveform.dtype

    # Each coefficient becomes a 1-element tensor matching the waveform, then the
    # three taps of each polynomial are joined into a length-3 coefficient vector.
    b_coeffs = torch.cat(
        [
            torch.as_tensor(b0, dtype=dt, device=dev).view(1),
            torch.as_tensor(b1, dtype=dt, device=dev).view(1),
            torch.as_tensor(b2, dtype=dt, device=dev).view(1),
        ]
    )
    a_coeffs = torch.cat(
        [
            torch.as_tensor(a0, dtype=dt, device=dev).view(1),
            torch.as_tensor(a1, dtype=dt, device=dev).view(1),
            torch.as_tensor(a2, dtype=dt, device=dev).view(1),
        ]
    )

    return lfilter(waveform, a_coeffs, b_coeffs)
334
+
335
+
336
def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
    r"""Apply contrast effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Comparable with compression, this effect modifies an audio signal to make it sound louder

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        enhancement_amount (float, optional): controls the amount of the enhancement
            Allowed range of values for enhancement_amount : 0-100
            Note that enhancement_amount = 0 still gives a significant contrast enhancement

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Raises:
        ValueError: if ``enhancement_amount`` is outside the range 0-100

    Reference:
        - http://sox.sourceforge.net/sox.html
    """
    if not (0 <= enhancement_amount <= 100):
        raise ValueError("Allowed range of values for enhancement_amount : 0-100")

    strength = enhancement_amount / 750.0

    # output = sin(x * pi/2 + strength * sin(4 * x * pi/2))
    scaled = waveform * (math.pi / 2)
    return torch.sin(scaled + strength * torch.sin(scaled * 4))
368
+
369
+
370
def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor:
    r"""Apply a DC shift to the audio. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    This can be useful to remove a DC offset
    (caused perhaps by a hardware problem in the recording chain) from the audio

    The input tensor is never modified; a new tensor is returned.

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        shift (float): indicates the amount to shift the audio
            Allowed range of values for shift : -2.0 to +2.0
        limiter_gain (float or None, optional): It is used only on peaks to prevent clipping
            It should have a value much less than 1 (e.g. 0.05 or 0.02)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
    """
    limiter_threshold = 0.0
    if limiter_gain is not None:
        limiter_threshold = 1.0 - (abs(shift) - limiter_gain)

    # Plain shift with hard clipping; used for every sample that is not a limited peak.
    shifted = (waveform + shift).clamp(min=-1, max=1)

    # The limited and plain results are computed out of place and merged with
    # torch.where, so the caller's tensor is left untouched (the previous masked
    # assignment wrote through an alias of ``waveform``).
    if limiter_gain is not None and shift > 0:
        peaks = waveform > limiter_threshold
        limited = (waveform - limiter_threshold) * limiter_gain / (1 - limiter_threshold)
        limited = (limited + limiter_threshold + shift).clamp(max=limiter_threshold)
        return torch.where(peaks, limited, shifted)
    elif limiter_gain is not None and shift < 0:
        peaks = waveform < -limiter_threshold
        limited = (waveform + limiter_threshold) * limiter_gain / (1 - limiter_threshold)
        limited = (limited - limiter_threshold + shift).clamp(min=-limiter_threshold)
        return torch.where(peaks, limited, shifted)

    return shifted
415
+
416
+
417
def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
    r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000``

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Raises:
        ValueError: if ``sample_rate`` is neither ``44100`` nor ``48000``

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html
    """
    # Fixed shelf parameters for each supported sample rate.
    if sample_rate == 48000:
        central_freq = 5356
        width_slope = 0.479
        gain = -9.62
    elif sample_rate == 44100:
        central_freq = 5283
        width_slope = 0.4845
        gain = -9.477
    else:
        raise ValueError("Sample rate must be 44100 (audio-CD) or 48000 (DAT)")

    w0 = 2 * math.pi * central_freq / sample_rate
    # Amplitude factor: 10 ** (gain / 40).
    A = math.exp(gain / 40.0 * math.log(10))
    alpha = math.sin(w0) / 2 * math.sqrt((A + 1 / A) * (1 / width_slope - 1) + 2)

    cos_w0 = math.cos(w0)
    slope_term = 2 * math.sqrt(A) * alpha
    minus_term = (A - 1) * cos_w0
    plus_term = (A + 1) * cos_w0

    b0 = A * ((A + 1) + minus_term + slope_term)
    b1 = -2 * A * ((A - 1) + plus_term)
    b2 = A * ((A + 1) + minus_term - slope_term)
    a0 = (A + 1) - minus_term + slope_term
    a1 = 2 * ((A - 1) - plus_term)
    a2 = (A + 1) - minus_term - slope_term

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
463
+
464
+
465
def _add_noise_shaping(dithered_waveform: Tensor, waveform: Tensor) -> Tensor:
    r"""Add first-order noise shaping to a dithered waveform.

    The shaping is driven by the dither error::

        error[n] = dithered[n] - original[n]
        noise_shaped_waveform[n] = dithered[n] + error[n-1]

    with ``error[-1]`` taken as zero.

    Args:
        dithered_waveform (Tensor): dithered audio of dimension `(..., time)`
        waveform (Tensor): original audio of dimension `(..., time)`

    Returns:
        Tensor: noise-shaped waveform with the leading dimensions of ``dithered_waveform``
    """
    num_frames = waveform.size()[-1]
    flat_original = waveform.reshape(-1, num_frames)

    out_shape = dithered_waveform.size()
    flat_dithered = dithered_waveform.reshape(-1, out_shape[-1])

    error = flat_dithered - flat_original

    # Delay the error by one sample: prepend a zero column and drop the overflow.
    lead = torch.zeros(error.size()[0], 1, dtype=error.dtype, device=error.device)
    delayed_error = torch.cat((lead, error), dim=1)[:, :num_frames]

    shaped = flat_dithered + delayed_error
    return shaped.reshape(out_shape[:-1] + shaped.shape[-1:])
487
+
488
+
489
def _random_index(upper: int) -> int:
    """Draw a random index from ``[0, upper)``; return 0 when ``upper`` is not positive."""
    if upper > 0:
        return int(torch.randint(upper, [1]).item())
    return 0


def _apply_probability_distribution(waveform: Tensor, density_function: str = "TPDF") -> Tensor:
    r"""Add dither noise of the chosen distribution and quantise to a 16-bit grid.

    Triangular probability density function (TPDF) dither noise has a
    triangular distribution; values in the center of the range have a higher
    probability of occurring.

    Rectangular probability density function (RPDF) dither noise has a
    uniform distribution; any value in the specified range has the same
    probability of occurring.

    Gaussian probability density function (GPDF) has a normal distribution.
    The relationship of probabilities of results follows a bell-shaped,
    or Gaussian curve, typical of dither generated by analog sources.

    The input tensor is never modified.

    Args:
        waveform (Tensor): Tensor of audio of dimension (..., time)
        density_function (str, optional): The density function of a
            continuous random variable (Default: ``"TPDF"``)
            Options: Triangular Probability Density Function - `TPDF`
                     Rectangular Probability Density Function - `RPDF`
                     Gaussian Probability Density Function - `GPDF`
            Any other value is treated as `TPDF`.

    Returns:
        Tensor: waveform dithered with the selected density function, same shape as the input
    """

    # pack batch
    shape = waveform.size()
    waveform = waveform.reshape(-1, shape[-1])

    # Exclusive upper bounds used for every random draw below.
    channel_size = waveform.size()[0] - 1
    time_size = waveform.size()[-1] - 1

    # Always drawn (even for TPDF) so the random stream is consumed exactly as before.
    random_channel = _random_index(channel_size)
    random_time = _random_index(time_size)

    number_of_bits = 16
    up_scaling = 2 ** (number_of_bits - 1) - 2
    signal_scaled = waveform * up_scaling
    down_scaling = 2 ** (number_of_bits - 1)

    signal_scaled_dis = waveform
    if density_function == "RPDF":
        RPDF = waveform[random_channel][random_time] - 0.5

        signal_scaled_dis = signal_scaled + RPDF
    elif density_function == "GPDF":
        # TODO Replace by distribution code once
        # https://github.com/pytorch/pytorch/issues/29843 is resolved
        # gaussian = torch.distributions.normal.Normal(torch.mean(waveform, -1), 1).sample()

        num_rand_variables = 6

        gaussian = waveform[random_channel][random_time]
        for ws in num_rand_variables * [time_size]:
            # Guarded draws: an upper bound of 0 (mono input or a single time
            # step) previously made torch.randint raise.
            rand_chan = _random_index(channel_size)
            rand_time = _random_index(ws)
            # Out-of-place add: ``gaussian`` starts as a view into ``waveform``,
            # so ``+=`` would overwrite a sample of the caller's tensor.
            gaussian = gaussian + waveform[rand_chan][rand_time]

        signal_scaled_dis = signal_scaled + gaussian
    else:
        # dtype needed for https://github.com/pytorch/pytorch/issues/32358
        TPDF = torch.bartlett_window(time_size + 1, dtype=signal_scaled.dtype, device=signal_scaled.device)
        TPDF = TPDF.repeat((channel_size + 1), 1)
        signal_scaled_dis = signal_scaled + TPDF

    quantised_signal_scaled = torch.round(signal_scaled_dis)
    quantised_signal = quantised_signal_scaled / down_scaling

    # unpack batch
    return quantised_signal.reshape(shape[:-1] + quantised_signal.shape[-1:])
596
+
597
+
598
def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor:
    r"""Apply dither

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Dither increases the perceived dynamic range of audio stored at a
    particular bit-depth by eliminating nonlinear truncation distortion
    (i.e. adding minimally perceived noise to mask distortion caused by quantization).

    Args:
        waveform (Tensor): Tensor of audio of dimension (..., time)
        density_function (str, optional):
            The density function of a continuous random variable. One of
            ``"TPDF"`` (Triangular Probability Density Function),
            ``"RPDF"`` (Rectangular Probability Density Function) or
            ``"GPDF"`` (Gaussian Probability Density Function) (Default: ``"TPDF"``).
        noise_shaping (bool, optional): a filtering process that shapes the spectral
            energy of quantisation error  (Default: ``False``)

    Returns:
       Tensor: waveform dithered
    """
    dithered = _apply_probability_distribution(waveform, density_function=density_function)

    # Without shaping, the dithered signal is the final result.
    if not noise_shaping:
        return dithered

    return _add_noise_shaping(dithered, waveform)
628
+
629
+
630
def equalizer_biquad(
    waveform: Tensor,
    sample_rate: int,
    center_freq: float,
    gain: float,
    Q: float = 0.707,
) -> Tensor:
    r"""Apply a biquad peaking equalizer filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        center_freq (float): filter's central frequency
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`
    """
    # Bring the scalar parameters onto the waveform's dtype and device.
    center_freq = torch.as_tensor(center_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)
    gain = torch.as_tensor(gain, dtype=waveform.dtype, device=waveform.device)

    omega = 2 * math.pi * center_freq / sample_rate
    # Amplitude factor: 10 ** (gain / 40).
    A = torch.exp(gain / 40.0 * math.log(10))
    alpha = torch.sin(omega) / 2 / Q

    numerator_term = alpha * A
    denominator_term = alpha / A
    # b1 and a1 share the same value.
    minus_two_cos = -2 * torch.cos(omega)

    return biquad(
        waveform,
        1 + numerator_term,
        minus_two_cos,
        1 - numerator_term,
        1 + denominator_term,
        minus_two_cos,
        1 - denominator_term,
    )
670
+
671
+
672
def filtfilt(
    waveform: Tensor,
    a_coeffs: Tensor,
    b_coeffs: Tensor,
    clamp: bool = True,
) -> Tensor:
    r"""Apply an IIR filter forward and backward to a waveform.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`.  Must be normalized to -1 to 1.
        a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either
                                1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
                                Lower delay coefficients are first, e.g. ``[a0, a1, a2, ...]``.
                                Must be same size as b_coeffs (pad with 0's as necessary).
        b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either
                                1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
                                Lower delay coefficients are first, e.g. ``[b0, b1, b2, ...]``.
                                Must be same size as a_coeffs (pad with 0's as necessary).
        clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)

    Returns:
        Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
        are 2D Tensors, or `(..., time)` otherwise.
    """
    # Forward pass; clamping is deferred to the second pass.
    forward = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True)

    # Backward pass: filter the time-reversed signal, then restore the time order.
    time_reversed = forward.flip(-1)
    backward = lfilter(time_reversed, a_coeffs, b_coeffs, clamp=clamp, batching=True)
    return backward.flip(-1)
711
+
712
+
713
+ def flanger(
714
+ waveform: Tensor,
715
+ sample_rate: int,
716
+ delay: float = 0.0,
717
+ depth: float = 2.0,
718
+ regen: float = 0.0,
719
+ width: float = 71.0,
720
+ speed: float = 0.5,
721
+ phase: float = 25.0,
722
+ modulation: str = "sinusoidal",
723
+ interpolation: str = "linear",
724
+ ) -> Tensor:
725
+ r"""Apply a flanger effect to the audio. Similar to SoX implementation.
726
+
727
+ .. devices:: CPU CUDA
728
+
729
+ .. properties:: Autograd TorchScript
730
+
731
+ Args:
732
+ waveform (Tensor): audio waveform of dimension of `(..., channel, time)` .
733
+ Max 4 channels allowed
734
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
735
+ delay (float, optional): desired delay in milliseconds(ms)
736
+ Allowed range of values are 0 to 30
737
+ depth (float, optional): desired delay depth in milliseconds(ms)
738
+ Allowed range of values are 0 to 10
739
+ regen (float, optional): desired regen(feedback gain) in dB
740
+ Allowed range of values are -95 to 95
741
+ width (float, optional): desired width(delay gain) in dB
742
+ Allowed range of values are 0 to 100
743
+ speed (float, optional): modulation speed in Hz
744
+ Allowed range of values are 0.1 to 10
745
+ phase (float, optional): percentage phase-shift for multi-channel
746
+ Allowed range of values are 0 to 100
747
+ modulation (str, optional): Use either "sinusoidal" or "triangular" modulation. (Default: ``sinusoidal``)
748
+ interpolation (str, optional): Use either "linear" or "quadratic" for delay-line interpolation.
749
+ (Default: ``linear``)
750
+
751
+ Returns:
752
+ Tensor: Waveform of dimension of `(..., channel, time)`
753
+
754
+ Reference:
755
+ - http://sox.sourceforge.net/sox.html
756
+
757
+ - Scott Lehman, `Effects Explained`_,
758
+
759
+ .. _Effects Explained:
760
+ https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
761
+ """
762
+
763
+ if modulation not in ("sinusoidal", "triangular"):
764
+ raise ValueError('Only "sinusoidal" or "triangular" modulation allowed')
765
+
766
+ if interpolation not in ("linear", "quadratic"):
767
+ raise ValueError('Only "linear" or "quadratic" interpolation allowed')
768
+
769
+ actual_shape = waveform.shape
770
+ device, dtype = waveform.device, waveform.dtype
771
+
772
+ if actual_shape[-2] > 4:
773
+ raise ValueError("Max 4 channels allowed")
774
+
775
+ # convert to 3D (batch, channels, time)
776
+ waveform = waveform.view(-1, actual_shape[-2], actual_shape[-1])
777
+
778
+ # Scaling
779
+ feedback_gain = regen / 100
780
+ delay_gain = width / 100
781
+ channel_phase = phase / 100
782
+ delay_min = delay / 1000
783
+ delay_depth = depth / 1000
784
+
785
+ n_channels = waveform.shape[-2]
786
+
787
+ if modulation == "sinusoidal":
788
+ wave_type = "SINE"
789
+ else:
790
+ wave_type = "TRIANGLE"
791
+
792
+ # Balance output:
793
+ in_gain = 1.0 / (1 + delay_gain)
794
+ delay_gain = delay_gain / (1 + delay_gain)
795
+
796
+ # Balance feedback loop:
797
+ delay_gain = delay_gain * (1 - abs(feedback_gain))
798
+
799
+ delay_buf_length = int((delay_min + delay_depth) * sample_rate + 0.5)
800
+ delay_buf_length = delay_buf_length + 2
801
+
802
+ delay_bufs = torch.zeros(waveform.shape[0], n_channels, delay_buf_length, dtype=dtype, device=device)
803
+ delay_last = torch.zeros(waveform.shape[0], n_channels, dtype=dtype, device=device)
804
+
805
+ lfo_length = int(sample_rate / speed)
806
+
807
+ table_min = math.floor(delay_min * sample_rate + 0.5)
808
+ table_max = delay_buf_length - 2.0
809
+
810
+ lfo = _generate_wave_table(
811
+ wave_type=wave_type,
812
+ data_type="FLOAT",
813
+ table_size=lfo_length,
814
+ min=float(table_min),
815
+ max=float(table_max),
816
+ phase=3 * math.pi / 2,
817
+ device=device,
818
+ )
819
+
820
+ output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)
821
+
822
+ delay_buf_pos = 0
823
+ lfo_pos = 0
824
+ channel_idxs = torch.arange(0, n_channels, device=device)
825
+
826
+ for i in range(waveform.shape[-1]):
827
+
828
+ delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length
829
+
830
+ cur_channel_phase = (channel_idxs * lfo_length * channel_phase + 0.5).to(torch.int64)
831
+ delay_tensor = lfo[(lfo_pos + cur_channel_phase) % lfo_length]
832
+ frac_delay = torch.frac(delay_tensor)
833
+ delay_tensor = torch.floor(delay_tensor)
834
+
835
+ int_delay = delay_tensor.to(torch.int64)
836
+
837
+ temp = waveform[:, :, i]
838
+
839
+ delay_bufs[:, :, delay_buf_pos] = temp + delay_last * feedback_gain
840
+
841
+ delayed_0 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
842
+
843
+ int_delay = int_delay + 1
844
+
845
+ delayed_1 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
846
+
847
+ int_delay = int_delay + 1
848
+
849
+ if interpolation == "linear":
850
+ delayed = delayed_0 + (delayed_1 - delayed_0) * frac_delay
851
+ else:
852
+ delayed_2 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
853
+
854
+ int_delay = int_delay + 1
855
+
856
+ delayed_2 = delayed_2 - delayed_0
857
+ delayed_1 = delayed_1 - delayed_0
858
+ a = delayed_2 * 0.5 - delayed_1
859
+ b = delayed_1 * 2 - delayed_2 * 0.5
860
+
861
+ delayed = delayed_0 + (a * frac_delay + b) * frac_delay
862
+
863
+ delay_last = delayed
864
+ output_waveform[:, :, i] = waveform[:, :, i] * in_gain + delayed * delay_gain
865
+
866
+ lfo_pos = (lfo_pos + 1) % lfo_length
867
+
868
+ return output_waveform.clamp(min=-1, max=1).view(actual_shape)
869
+
870
+
871
def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor:
    r"""Scale the whole waveform by a gain expressed in decibels.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): Tensor of audio of dimension `(..., time)`.
        gain_db (float, optional): Gain adjustment in decibels (dB) (Default: ``1.0``).

    Returns:
        Tensor: ``waveform * 10 ** (gain_db / 20)``. When ``gain_db`` is 0 the
        input tensor itself is returned (no copy is made).
    """
    if gain_db != 0:
        # Convert the dB value to a linear amplitude ratio before scaling.
        return waveform * (10 ** (gain_db / 20))

    return waveform
891
+
892
+
893
def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        cutoff_freq (float or torch.Tensor): filter cutoff frequency
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform dimension of `(..., time)`
    """
    # Promote the scalar parameters to tensors matching the waveform so the
    # coefficient math below runs in the waveform's dtype and on its device.
    cutoff_freq = torch.as_tensor(cutoff_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)

    w0 = 2 * math.pi * cutoff_freq / sample_rate
    cos_w0 = torch.cos(w0)
    alpha = torch.sin(w0) / 2.0 / Q

    # Numerator is symmetric: b2 equals b0.
    b0 = (1 + cos_w0) / 2
    b1 = -1 - cos_w0
    a0 = 1 + alpha
    a1 = -2 * cos_w0
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b0, a0, a1, a2)
924
+
925
+
926
+ def _lfilter_core_generic_loop(input_signal_windows: Tensor, a_coeffs_flipped: Tensor, padded_output_waveform: Tensor):
927
+ n_order = a_coeffs_flipped.size(1)
928
+ a_coeffs_flipped = a_coeffs_flipped.unsqueeze(2)
929
+ for i_sample, o0 in enumerate(input_signal_windows.permute(2, 0, 1)):
930
+ windowed_output_signal = padded_output_waveform[:, :, i_sample : i_sample + n_order]
931
+ o0 -= (windowed_output_signal.transpose(0, 1) @ a_coeffs_flipped)[..., 0].t()
932
+ padded_output_waveform[:, :, i_sample + n_order - 1] = o0
933
+
934
+
935
# Use the compiled extension kernel when it is available; otherwise fall back
# to the pure-PyTorch loop defined above.
_lfilter_core_loop = (
    torch.ops.torchaudio._lfilter_core_loop if _IS_TORCHAUDIO_EXT_AVAILABLE else _lfilter_core_generic_loop
)
939
+
940
+
941
class DifferentiableFIR(torch.autograd.Function):
    """Causal per-channel FIR filter with a hand-written backward pass.

    ``forward`` takes a waveform indexed as `(batch, channel, time)` and
    coefficients of shape `(channel, n_order)` and returns a tensor with the
    same time length as the input.
    """

    @staticmethod
    def forward(ctx, waveform, b_coeffs):
        n_channel, n_order = b_coeffs.size(0), b_coeffs.size(1)
        # conv1d computes cross-correlation, so flip the taps to get convolution.
        kernel = b_coeffs.flip(1).contiguous().unsqueeze(1)
        # Left-pad so that each output sample only depends on past input.
        causal_input = F.pad(waveform, (n_order - 1, 0))
        output = F.conv1d(causal_input, kernel, groups=n_channel)
        ctx.save_for_backward(waveform, b_coeffs, output)
        return output

    @staticmethod
    def backward(ctx, dy):
        x, b_coeffs, _ = ctx.saved_tensors
        n_batch, n_channel = x.size(0), x.size(1)
        n_order = b_coeffs.size(1)

        db = None
        if b_coeffs.requires_grad:
            # Correlate every (batch, channel) row of the padded input with its
            # own upstream gradient, then reduce over the batch.
            n_rows = n_batch * n_channel
            padded_x = F.pad(x, (n_order - 1, 0)).view(1, n_rows, -1)
            correlation = F.conv1d(padded_x, dy.view(n_rows, 1, -1), groups=n_rows)
            db = correlation.view(n_batch, n_channel, -1).sum(0).flip(1)

        dx = None
        if x.requires_grad:
            # Right-pad: the gradient w.r.t. x[n] collects from y[n .. n + n_order - 1].
            dx = F.conv1d(F.pad(dy, (0, n_order - 1)), b_coeffs.unsqueeze(1), groups=n_channel)
        return (dx, db)
972
+
973
+
974
class DifferentiableIIR(torch.autograd.Function):
    """All-pole (feedback-only) IIR filter with a hand-written backward pass.

    ``forward`` takes a waveform of shape `(batch, channel, time)` and
    coefficients of shape `(channel, n_order)`; the core recursion is delegated
    to ``_lfilter_core_loop``.
    """

    @staticmethod
    def forward(ctx, waveform, a_coeffs_normalized):
        n_batch, n_channel, n_sample = waveform.shape
        n_order = a_coeffs_normalized.size(1)

        flipped_coeffs = a_coeffs_normalized.flip(1).contiguous()
        # The first n_order - 1 samples stay zero and act as the initial state.
        padded_output_waveform = torch.zeros(
            n_batch, n_channel, n_sample + n_order - 1, device=waveform.device, dtype=waveform.dtype
        )
        _lfilter_core_loop(waveform, flipped_coeffs, padded_output_waveform)
        output = padded_output_waveform[:, :, n_order - 1 :]
        ctx.save_for_backward(waveform, a_coeffs_normalized, output)
        return output

    @staticmethod
    def backward(ctx, dy):
        x, a_coeffs_normalized, y = ctx.saved_tensors
        n_channel = x.size(1)
        n_order = a_coeffs_normalized.size(1)

        # Run the same filter over the time-reversed gradient, then restore order.
        filtered_grad = DifferentiableIIR.apply(dy.flip(2).contiguous(), a_coeffs_normalized).flip(2)

        dx = None
        if x.requires_grad:
            dx = filtered_grad

        da = None
        if a_coeffs_normalized.requires_grad:
            grad_rows = filtered_grad.transpose(0, 1).reshape(n_channel, 1, -1)
            # Sliding windows of length n_order over the left-padded output.
            output_windows = (
                F.pad(y, (n_order - 1, 0)).unfold(2, n_order, 1).transpose(0, 1).reshape(n_channel, -1, n_order)
            )
            da = -(grad_rows @ output_windows).squeeze(1).flip(1)
        return (dx, da)
1008
+
1009
+
1010
def _lfilter(waveform, a_coeffs, b_coeffs):
    """Apply the FIR (numerator) stage followed by the IIR (denominator) stage.

    Both coefficient sets are divided by the leading denominator coefficient
    ``a_coeffs[:, 0:1]`` before being handed to the two stages.
    """
    leading_a = a_coeffs[:, 0:1]
    fir_output = DifferentiableFIR.apply(waveform, b_coeffs / leading_a)
    return DifferentiableIIR.apply(fir_output, a_coeffs / leading_a)
1013
+
1014
+
1015
def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor:
    r"""Perform an IIR filter by evaluating difference equation, using differentiable implementation
    developed separately by *Yu et al.* :cite:`ismir_YuF23` and *Forgione et al.* :cite:`forgione2021dynonet`.
    The gradients of ``a_coeffs`` are computed based on a faster algorithm from :cite:`ycy2024diffapf`.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Note:
        To avoid numerical problems, small filter order is preferred.
        Using double precision could also minimize numerical precision errors.

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`.  Must be normalized to -1 to 1.
        a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either
                                1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
                                Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``.
                                Must be same size as b_coeffs (pad with 0's as necessary).
        b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either
                                1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
                                Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``.
                                Must be same size as a_coeffs (pad with 0's as necessary).
        clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)
        batching (bool, optional): Effective only when coefficients are 2D. If ``True``, then waveform should be at
                                    least 2D, and the size of second axis from last should equals to ``num_filters``.
                                    The output can be expressed as ``output[..., i, :] = lfilter(waveform[..., i, :],
                                    a_coeffs[i], b_coeffs[i], clamp=clamp, batching=False)``. (Default: ``True``)

    Returns:
        Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
        are 2D Tensors, or `(..., time)` otherwise.

    Raises:
        ValueError: If ``a_coeffs`` and ``b_coeffs`` differ in size, if the coefficients are not 1D or 2D,
            or if (with 2D coefficients and ``batching=True``) the waveform has fewer than 2 dimensions or
            its second-to-last axis does not match ``num_filters``.
    """
    if a_coeffs.size() != b_coeffs.size():
        raise ValueError(
            "Expected coeffs to be the same size. "
            f"Found: a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}"
        )
    if a_coeffs.ndim < 1 or a_coeffs.ndim > 2:
        raise ValueError(f"Expected coeffs to have at least 1 and at most 2 dimensions. Found: {a_coeffs.ndim}")

    if a_coeffs.ndim > 1:
        if batching:
            # waveform.shape[-2] is read below, so at least two axes are required.
            if waveform.ndim < 2:
                raise ValueError(
                    "Expected waveform to have at least 2 dimensions when batching with 2D coeffs. "
                    f"Found: {waveform.ndim}"
                )
            if waveform.shape[-2] != a_coeffs.shape[0]:
                raise ValueError(
                    "Expected number of batches in waveform and coeffs to be the same. "
                    f"Found: coeffs batches: {a_coeffs.shape[0]}, waveform batches: {waveform.shape[-2]}"
                )
        else:
            # Replicate the waveform once per filter along a new second-to-last axis.
            waveform = torch.stack([waveform] * a_coeffs.shape[0], -2)
    else:
        a_coeffs = a_coeffs.unsqueeze(0)
        b_coeffs = b_coeffs.unsqueeze(0)

    # pack batch
    shape = waveform.size()
    waveform = waveform.reshape(-1, a_coeffs.shape[0], shape[-1])
    output = _lfilter(waveform, a_coeffs, b_coeffs)

    if clamp:
        output = torch.clamp(output, min=-1.0, max=1.0)

    # unpack batch
    output = output.reshape(shape[:-1] + output.shape[-1:])

    return output
1083
+
1084
+
1085
def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        cutoff_freq (float or torch.Tensor): filter cutoff frequency
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`
    """
    # Promote the scalar parameters to tensors matching the waveform so the
    # coefficient math below runs in the waveform's dtype and on its device.
    cutoff_freq = torch.as_tensor(cutoff_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)

    w0 = 2 * math.pi * cutoff_freq / sample_rate
    cos_w0 = torch.cos(w0)
    alpha = torch.sin(w0) / 2 / Q

    # Numerator is symmetric: b2 equals b0.
    b0 = (1 - cos_w0) / 2
    b1 = 1 - cos_w0
    a0 = 1 + alpha
    a1 = -2 * cos_w0
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b0, a0, a1, a2)
1116
+
1117
+
1118
+ def _overdrive_core_loop_generic(
1119
+ waveform: Tensor, temp: Tensor, last_in: Tensor, last_out: Tensor, output_waveform: Tensor
1120
+ ):
1121
+ for i in range(waveform.shape[-1]):
1122
+ last_out = temp[:, i] - last_in + 0.995 * last_out
1123
+ last_in = temp[:, i]
1124
+ output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
1125
+
1126
+
1127
# Use the compiled extension kernel when it is available; otherwise fall back
# to the pure-PyTorch loop defined above.
_overdrive_core_loop_cpu = (
    torch.ops.torchaudio._overdrive_core_loop if _IS_TORCHAUDIO_EXT_AVAILABLE else _overdrive_core_loop_generic
)
1131
+
1132
+
1133
def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
    r"""Apply an overdrive effect to the audio. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    This effect applies a non linear distortion to the audio signal.

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        gain (float, optional): desired gain at the boost (or attenuation) in dB
            Allowed range of values are 0 to 100
        colour (float, optional):  controls the amount of even harmonic content in the over-driven output
            Allowed range of values are 0 to 100

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
    """
    actual_shape = waveform.shape
    device, dtype = waveform.device, waveform.dtype

    # convert to 2D (..,time)
    # reshape (unlike view) also accepts non-contiguous input such as a
    # transposed tensor; for contiguous input it returns the same view.
    waveform = waveform.reshape(-1, actual_shape[-1])

    gain = _dB2Linear(gain)
    colour = colour / 200
    last_in = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device)
    last_out = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device)

    temp = waveform * gain + colour

    # Soft clipping: values outside [-1, 1] saturate at -2/3 or 2/3, values
    # inside are shaped with x - x^3 / 3.
    mask1 = temp < -1
    temp[mask1] = torch.tensor(-2.0 / 3.0, dtype=dtype, device=device)
    # Wrapping the constant with Tensor is required for Torchscript

    mask2 = temp > 1
    temp[mask2] = torch.tensor(2.0 / 3.0, dtype=dtype, device=device)

    mask3 = ~mask1 & ~mask2
    temp[mask3] = temp[mask3] - (temp[mask3] ** 3) * (1.0 / 3)

    output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)

    # Uses CPU optimized loop function if available for CPU device
    if device == torch.device("cpu"):
        _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform)
    else:
        _overdrive_core_loop_generic(waveform, temp, last_in, last_out, output_waveform)

    return output_waveform.clamp(min=-1, max=1).view(actual_shape)
1187
+
1188
+
1189
def phaser(
    waveform: Tensor,
    sample_rate: int,
    gain_in: float = 0.4,
    gain_out: float = 0.74,
    delay_ms: float = 3.0,
    decay: float = 0.4,
    mod_speed: float = 0.5,
    sinusoidal: bool = True,
) -> Tensor:
    r"""Apply a phasing effect to the audio. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain_in (float, optional): linear gain factor the input is multiplied by
            Allowed range of values are 0 to 1
        gain_out (float, optional): linear gain factor the output is multiplied by
            Allowed range of values are 0 to 1e9
        delay_ms (float, optional): desired delay in milliseconds
            Allowed range of values are 0 to 5.0
        decay (float, optional):  desired decay relative to gain-in
            Allowed range of values are 0 to 0.99
        mod_speed (float, optional):  modulation speed in Hz
            Allowed range of values are 0.1 to 2
        sinusoidal (bool, optional):  If ``True``, uses sinusoidal modulation (preferable for multiple instruments)
            If ``False``, uses triangular modulation (gives single instruments a sharper phasing effect)
            (Default: ``True``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - Scott Lehman, `Effects Explained`_.

    .. _Effects Explained:
        https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
    """
    actual_shape = waveform.shape
    device, dtype = waveform.device, waveform.dtype

    # convert to 2D (channels,time)
    # reshape (unlike view) also accepts non-contiguous input such as a
    # transposed tensor; for contiguous input it returns the same view.
    waveform = waveform.reshape(-1, actual_shape[-1])

    delay_buf_len = int((delay_ms * 0.001 * sample_rate) + 0.5)
    delay_buf = torch.zeros(waveform.shape[0], delay_buf_len, dtype=dtype, device=device)

    mod_buf_len = int(sample_rate / mod_speed + 0.5)

    if sinusoidal:
        wave_type = "SINE"
    else:
        wave_type = "TRIANGLE"

    # Integer table of delay offsets in [1, delay_buf_len], one entry per
    # sample of a full modulation period.
    mod_buf = _generate_wave_table(
        wave_type=wave_type,
        data_type="INT",
        table_size=mod_buf_len,
        min=1.0,
        max=float(delay_buf_len),
        phase=math.pi / 2,
        device=device,
    )

    delay_pos = 0
    mod_pos = 0

    output_waveform_pre_gain_list = []
    waveform = waveform * gain_in
    delay_buf = delay_buf * decay
    # Per-time-step lists are used so the loop below replaces list entries
    # instead of writing into tensors in place.
    waveform_list = [waveform[:, i] for i in range(waveform.size(1))]
    delay_buf_list = [delay_buf[:, i] for i in range(delay_buf.size(1))]
    mod_buf_list = [mod_buf[i] for i in range(mod_buf.size(0))]

    for i in range(waveform.shape[-1]):
        # Read position in the circular delay line, offset by the modulation table.
        idx = int((delay_pos + mod_buf_list[mod_pos]) % delay_buf_len)
        mod_pos = (mod_pos + 1) % mod_buf_len
        delay_pos = (delay_pos + 1) % delay_buf_len
        temp = (waveform_list[i]) + (delay_buf_list[idx])
        delay_buf_list[delay_pos] = temp * decay
        output_waveform_pre_gain_list.append(temp)

    output_waveform = torch.stack(output_waveform_pre_gain_list, dim=1).to(dtype=dtype, device=device)
    output_waveform.mul_(gain_out)

    return output_waveform.clamp(min=-1, max=1).view(actual_shape)
1280
+
1281
+
1282
def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
    r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz).
            Allowed sample rates in Hz : ``44100``,``48000``,``88200``,``96000``

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Raises:
        ValueError: If ``sample_rate`` is not one of the allowed values.

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    # Tabulated filter roots for each supported sample rate.
    if sample_rate == 44100:
        zero_a, zero_b = -0.2014898, 0.9233820
        pole_a, pole_b = 0.7083149, 0.9924091
    elif sample_rate == 48000:
        zero_a, zero_b = -0.1766069, 0.9321590
        pole_a, pole_b = 0.7396325, 0.9931330
    elif sample_rate == 88200:
        zero_a, zero_b = -0.1168735, 0.9648312
        pole_a, pole_b = 0.8590646, 0.9964002
    elif sample_rate == 96000:
        zero_a, zero_b = -0.1141486, 0.9676817
        pole_a, pole_b = 0.8699137, 0.9966946
    else:
        raise ValueError("Sample rate must be 44.1k, 48k, 88.2k, or 96k")

    # Monic polynomial coefficients whose roots are the two zeros (numerator)
    # and the two poles (denominator).
    b0 = 1.0
    b1 = -(zero_a + zero_b)
    b2 = zero_a * zero_b
    a0 = 1.0
    a1 = -(pole_a + pole_b)
    a2 = pole_a * pole_b

    # Normalize to 0dB at 1kHz: evaluate both polynomials at that frequency
    # and scale the numerator by the inverse of the magnitude response.
    y = 2 * math.pi * 1000 / sample_rate
    cos_1, cos_2 = math.cos(-y), math.cos(-2 * y)
    sin_1, sin_2 = math.sin(-y), math.sin(-2 * y)
    b_re = b0 + b1 * cos_1 + b2 * cos_2
    a_re = a0 + a1 * cos_1 + a2 * cos_2
    b_im = b1 * sin_1 + b2 * sin_2
    a_im = a1 * sin_1 + a2 * sin_2
    g = 1 / math.sqrt((b_re**2 + b_im**2) / (a_re**2 + a_im**2))

    return biquad(waveform, b0 * g, b1 * g, b2 * g, a0, a1, a2)
1344
+
1345
+
1346
def treble_biquad(
    waveform: Tensor,
    sample_rate: int,
    gain: float,
    central_freq: float = 3000,
    Q: float = 0.707,
) -> Tensor:
    r"""Design a treble tone-control effect.  Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB.
        central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``3000``)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    # Promote the scalar parameters to tensors matching the waveform so the
    # coefficient math below runs in the waveform's dtype and on its device.
    central_freq = torch.as_tensor(central_freq, dtype=waveform.dtype, device=waveform.device)
    Q = torch.as_tensor(Q, dtype=waveform.dtype, device=waveform.device)
    gain = torch.as_tensor(gain, dtype=waveform.dtype, device=waveform.device)

    w0 = 2 * math.pi * central_freq / sample_rate
    cos_w0 = torch.cos(w0)
    alpha = torch.sin(w0) / 2 / Q
    # exp(gain / 40 * ln 10), i.e. 10 ** (gain / 40).
    A = torch.exp(gain / 40 * math.log(10))

    A_plus_1 = A + 1
    A_minus_1 = A - 1
    slope_term = 2 * torch.sqrt(A) * alpha
    minus_cos_term = A_minus_1 * cos_w0
    plus_cos_term = A_plus_1 * cos_w0

    b0 = A * (A_plus_1 + minus_cos_term + slope_term)
    b1 = -2 * A * (A_minus_1 + plus_cos_term)
    b2 = A * (A_plus_1 + minus_cos_term - slope_term)
    a0 = A_plus_1 - minus_cos_term + slope_term
    a1 = 2 * (A_minus_1 - plus_cos_term)
    a2 = A_plus_1 - minus_cos_term - slope_term

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
1395
+
1396
+
1397
+ def _measure(
1398
+ measure_len_ws: int,
1399
+ samples: Tensor,
1400
+ spectrum: Tensor,
1401
+ noise_spectrum: Tensor,
1402
+ spectrum_window: Tensor,
1403
+ spectrum_start: int,
1404
+ spectrum_end: int,
1405
+ cepstrum_window: Tensor,
1406
+ cepstrum_start: int,
1407
+ cepstrum_end: int,
1408
+ noise_reduction_amount: float,
1409
+ measure_smooth_time_mult: float,
1410
+ noise_up_time_mult: Tensor,
1411
+ noise_down_time_mult: Tensor,
1412
+ boot_count: int,
1413
+ ) -> float:
1414
+ device = samples.device
1415
+
1416
+ if spectrum.size(-1) != noise_spectrum.size(-1):
1417
+ raise ValueError(
1418
+ "Expected spectrum size to match noise spectrum size in final dimension."
1419
+ f"Found: spectrum size: {spectrum.size()}, noise_spectrum size: {noise_spectrum.size()}"
1420
+ )
1421
+
1422
+ dft_len_ws = spectrum.size()[-1]
1423
+
1424
+ dftBuf = torch.zeros(dft_len_ws, device=device)
1425
+
1426
+ dftBuf[:measure_len_ws] = samples * spectrum_window[:measure_len_ws]
1427
+
1428
+ # lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf);
1429
+ _dftBuf = torch.fft.rfft(dftBuf)
1430
+
1431
+ mult: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult
1432
+
1433
+ _d = _dftBuf[spectrum_start:spectrum_end].abs()
1434
+ spectrum[spectrum_start:spectrum_end].mul_(mult).add_(_d * (1 - mult))
1435
+ _d = spectrum[spectrum_start:spectrum_end] ** 2
1436
+
1437
+ _zeros = torch.zeros(spectrum_end - spectrum_start, device=device)
1438
+ _mult = (
1439
+ _zeros
1440
+ if boot_count >= 0
1441
+ else torch.where(
1442
+ _d > noise_spectrum[spectrum_start:spectrum_end],
1443
+ noise_up_time_mult, # if
1444
+ noise_down_time_mult, # else,
1445
+ )
1446
+ )
1447
+
1448
+ noise_spectrum[spectrum_start:spectrum_end].mul_(_mult).add_(_d * (1 - _mult))
1449
+ _d = torch.sqrt(
1450
+ torch.max(
1451
+ _zeros,
1452
+ _d - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end],
1453
+ ),
1454
+ )
1455
+
1456
+ _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1, device=device)
1457
+ _cepstrum_Buf[spectrum_start:spectrum_end] = _d * cepstrum_window
1458
+ _cepstrum_Buf[spectrum_end : dft_len_ws >> 1].zero_()
1459
+
1460
+ # lsx_safe_rdft((int)p->dft_len_ws >> 1, 1, c->dftBuf);
1461
+ _cepstrum_Buf = torch.fft.rfft(_cepstrum_Buf)
1462
+
1463
+ result: float = float(torch.sum(_cepstrum_Buf[cepstrum_start:cepstrum_end].abs().pow(2)))
1464
+ result = math.log(result / (cepstrum_end - cepstrum_start)) if result > 0 else -math.inf
1465
+ return max(0, 21 + result)
1466
+
1467
+
1468
+ def vad(
1469
+ waveform: Tensor,
1470
+ sample_rate: int,
1471
+ trigger_level: float = 7.0,
1472
+ trigger_time: float = 0.25,
1473
+ search_time: float = 1.0,
1474
+ allowed_gap: float = 0.25,
1475
+ pre_trigger_time: float = 0.0,
1476
+ # Fine-tuning parameters
1477
+ boot_time: float = 0.35,
1478
+ noise_up_time: float = 0.1,
1479
+ noise_down_time: float = 0.01,
1480
+ noise_reduction_amount: float = 1.35,
1481
+ measure_freq: float = 20.0,
1482
+ measure_duration: Optional[float] = None,
1483
+ measure_smooth_time: float = 0.4,
1484
+ hp_filter_freq: float = 50.0,
1485
+ lp_filter_freq: float = 6000.0,
1486
+ hp_lifter_freq: float = 150.0,
1487
+ lp_lifter_freq: float = 2000.0,
1488
+ ) -> Tensor:
1489
+ r"""Voice Activity Detector. Similar to SoX implementation.
1490
+
1491
+ .. devices:: CPU CUDA
1492
+
1493
+ .. properties:: TorchScript
1494
+
1495
+ Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
1496
+ The algorithm currently uses a simple cepstral power measurement to detect voice,
1497
+ so may be fooled by other things, especially music.
1498
+
1499
+ The effect can trim only from the front of the audio,
1500
+ so in order to trim from the back, the reverse effect must also be used.
1501
+
1502
+ Args:
1503
+ waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)`
1504
+ Tensor of shape `(channels, time)` is treated as a multi-channel recording
1505
+ of the same event and the resulting output will be trimmed to the earliest
1506
+ voice activity in any channel.
1507
+ sample_rate (int): Sample rate of audio signal.
1508
+ trigger_level (float, optional): The measurement level used to trigger activity detection.
1509
+ This may need to be cahnged depending on the noise level, signal level,
1510
+ and other characteristics of the input audio. (Default: 7.0)
1511
+ trigger_time (float, optional): The time constant (in seconds)
1512
+ used to help ignore short bursts of sound. (Default: 0.25)
1513
+ search_time (float, optional): The amount of audio (in seconds)
1514
+ to search for quieter/shorter bursts of audio to include prior
1515
+ to the detected trigger point. (Default: 1.0)
1516
+ allowed_gap (float, optional): The allowed gap (in seconds) between
1517
+ quieter/shorter bursts of audio to include prior
1518
+ to the detected trigger point. (Default: 0.25)
1519
+ pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve
1520
+ before the trigger point and any found quieter/shorter bursts. (Default: 0.0)
1521
+ boot_time (float, optional) The algorithm (internally) uses adaptive noise
1522
+ estimation/reduction in order to detect the start of the wanted audio.
1523
+ This option sets the time for the initial noise estimate. (Default: 0.35)
1524
+ noise_up_time (float, optional) Time constant used by the adaptive noise estimator
1525
+ for when the noise level is increasing. (Default: 0.1)
1526
+ noise_down_time (float, optional) Time constant used by the adaptive noise estimator
1527
+ for when the noise level is decreasing. (Default: 0.01)
1528
+ noise_reduction_amount (float, optional) Amount of noise reduction to use in
1529
+ the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35)
1530
+ measure_freq (float, optional) Frequency of the algorithm's
1531
+ processing/measurements. (Default: 20.0)
1532
+ measure_duration: (float, optional) Measurement duration.
1533
+ (Default: Twice the measurement period; i.e. with overlap.)
1534
+ measure_smooth_time (float, optional) Time constant used to smooth
1535
+ spectral measurements. (Default: 0.4)
1536
+ hp_filter_freq (float, optional) "Brick-wall" frequency of high-pass filter applied
1537
+ at the input to the detector algorithm. (Default: 50.0)
1538
+ lp_filter_freq (float, optional) "Brick-wall" frequency of low-pass filter applied
1539
+ at the input to the detector algorithm. (Default: 6000.0)
1540
+ hp_lifter_freq (float, optional) "Brick-wall" frequency of high-pass lifter used
1541
+ in the detector algorithm. (Default: 150.0)
1542
+ lp_lifter_freq (float, optional) "Brick-wall" frequency of low-pass lifter used
1543
+ in the detector algorithm. (Default: 2000.0)
1544
+
1545
+ Returns:
1546
+ Tensor: Tensor of audio of dimension `(..., time)`.
1547
+
1548
+ Reference:
1549
+ - http://sox.sourceforge.net/sox.html
1550
+ """
1551
+ device = waveform.device
1552
+
1553
+ if waveform.ndim > 2:
1554
+ warnings.warn(
1555
+ "Expected input tensor dimension of 1 for single channel"
1556
+ f" or 2 for multi-channel. Got {waveform.ndim} instead. "
1557
+ "Batch semantics is not supported. "
1558
+ "Please refer to https://github.com/pytorch/audio/issues/1348"
1559
+ " and https://github.com/pytorch/audio/issues/1468."
1560
+ )
1561
+
1562
+ measure_duration: float = 2.0 / measure_freq if measure_duration is None else measure_duration
1563
+
1564
+ measure_len_ws = int(sample_rate * measure_duration + 0.5)
1565
+ measure_len_ns = measure_len_ws
1566
+ # for (dft_len_ws = 16; dft_len_ws < measure_len_ws; dft_len_ws <<= 1);
1567
+ dft_len_ws = 16
1568
+ while dft_len_ws < measure_len_ws:
1569
+ dft_len_ws *= 2
1570
+
1571
+ measure_period_ns = int(sample_rate / measure_freq + 0.5)
1572
+ measures_len = math.ceil(search_time * measure_freq)
1573
+ search_pre_trigger_len_ns = measures_len * measure_period_ns
1574
+ gap_len = int(allowed_gap * measure_freq + 0.5)
1575
+
1576
+ fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5)
1577
+ samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns
1578
+
1579
+ spectrum_window = torch.zeros(measure_len_ws, device=device)
1580
+ for i in range(measure_len_ws):
1581
+ # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32)
1582
+ spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws))
1583
+ # lsx_apply_hann(spectrum_window, (int)measure_len_ws);
1584
+ spectrum_window *= torch.hann_window(measure_len_ws, device=device, dtype=torch.float)
1585
+
1586
+ spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5)
1587
+ spectrum_start: int = max(spectrum_start, 1)
1588
+ spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5)
1589
+ spectrum_end: int = min(spectrum_end, dft_len_ws // 2)
1590
+
1591
+ cepstrum_window = torch.zeros(spectrum_end - spectrum_start, device=device)
1592
+ for i in range(spectrum_end - spectrum_start):
1593
+ cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start)
1594
+ # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start));
1595
+ cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, device=device, dtype=torch.float)
1596
+
1597
+ cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq)
1598
+ cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq)
1599
+ cepstrum_end = min(cepstrum_end, dft_len_ws // 4)
1600
+
1601
+ if cepstrum_end <= cepstrum_start:
1602
+ raise ValueError(
1603
+ "Expected cepstrum_start to be smaller than cepstrum_end."
1604
+ f"Found: cepstrum_start: {cepstrum_start}, cepstrum_end: {cepstrum_end}."
1605
+ )
1606
+
1607
+ noise_up_time_mult = torch.tensor(math.exp(-1.0 / (noise_up_time * measure_freq)), device=device)
1608
+ noise_down_time_mult = torch.tensor(math.exp(-1.0 / (noise_down_time * measure_freq)), device=device)
1609
+ measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq))
1610
+ trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq))
1611
+
1612
+ boot_count_max = int(boot_time * measure_freq - 0.5)
1613
+ boot_count = measures_index = flushedLen_ns = 0
1614
+
1615
+ # pack batch
1616
+ shape = waveform.size()
1617
+ waveform = waveform.view(-1, shape[-1])
1618
+
1619
+ n_channels, ilen = waveform.size()
1620
+
1621
+ mean_meas = torch.zeros(n_channels, device=device)
1622
+ spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
1623
+ noise_spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
1624
+ measures = torch.zeros(n_channels, measures_len, device=device)
1625
+
1626
+ has_triggered: bool = False
1627
+ num_measures_to_flush: int = 0
1628
+
1629
+ pos = 0
1630
+ for pos in range(measure_len_ns, ilen, measure_period_ns):
1631
+ for i in range(n_channels):
1632
+ meas: float = _measure(
1633
+ measure_len_ws=measure_len_ws,
1634
+ samples=waveform[i, pos - measure_len_ws : pos],
1635
+ spectrum=spectrum[i],
1636
+ noise_spectrum=noise_spectrum[i],
1637
+ spectrum_window=spectrum_window,
1638
+ spectrum_start=spectrum_start,
1639
+ spectrum_end=spectrum_end,
1640
+ cepstrum_window=cepstrum_window,
1641
+ cepstrum_start=cepstrum_start,
1642
+ cepstrum_end=cepstrum_end,
1643
+ noise_reduction_amount=noise_reduction_amount,
1644
+ measure_smooth_time_mult=measure_smooth_time_mult,
1645
+ noise_up_time_mult=noise_up_time_mult,
1646
+ noise_down_time_mult=noise_down_time_mult,
1647
+ boot_count=boot_count,
1648
+ )
1649
+ measures[i, measures_index] = meas
1650
+ mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult)
1651
+
1652
+ has_triggered = has_triggered or (mean_meas[i] >= trigger_level)
1653
+ if has_triggered:
1654
+ n: int = measures_len
1655
+ k: int = measures_index
1656
+ jTrigger: int = n
1657
+ jZero: int = n
1658
+ j: int = 0
1659
+
1660
+ for j in range(n):
1661
+ if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len):
1662
+ jZero = jTrigger = j
1663
+ elif (measures[i, k] == 0) and (jTrigger >= jZero):
1664
+ jZero = j
1665
+ k = (k + n - 1) % n
1666
+ j = min(j, jZero)
1667
+ # num_measures_to_flush = range_limit(j, num_measures_to_flush, n);
1668
+ num_measures_to_flush = min(max(num_measures_to_flush, j), n)
1669
+ # end if has_triggered
1670
+ # end for channel
1671
+ measures_index += 1
1672
+ measures_index = measures_index % measures_len
1673
+ if boot_count >= 0:
1674
+ boot_count = -1 if boot_count == boot_count_max else boot_count + 1
1675
+
1676
+ if has_triggered:
1677
+ flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns
1678
+ break
1679
+ # end for window
1680
+ if not has_triggered and shape[-1] >= fixed_pre_trigger_len_ns:
1681
+ return waveform[..., :fixed_pre_trigger_len_ns].view(shape[:-1] + torch.Size([fixed_pre_trigger_len_ns]))
1682
+
1683
+ res = waveform[:, max(pos - samplesLen_ns + flushedLen_ns, 0) :]
1684
+ # unpack batch
1685
+ return res.view(shape[:-1] + res.shape[-1:])