torchaudio 2.8.0__cp313-cp313-win_amd64.whl → 2.9.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. Click here for more details.

Files changed (92) hide show
  1. torchaudio/__init__.py +179 -39
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +12 -3
  5. torchaudio/_torchcodec.py +73 -85
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +0 -2
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +26 -60
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +14 -2
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +1 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +1 -0
  23. torchaudio/transforms/_transforms.py +2 -2
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -3
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -350
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -20
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -150
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -68
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -441
  54. torchaudio/prototype/functional/_rir.py +0 -382
  55. torchaudio/prototype/functional/functional.py +0 -193
  56. torchaudio/prototype/models/__init__.py +0 -39
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -337
  59. torchaudio/prototype/models/conv_emformer.py +0 -529
  60. torchaudio/prototype/models/hifi_gan.py +0 -342
  61. torchaudio/prototype/models/rnnt.py +0 -717
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -402
  63. torchaudio/prototype/pipelines/__init__.py +0 -21
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -461
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -275
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -118
  75. torchaudio-2.8.0.dist-info/RECORD +0 -145
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -977
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -275
  91. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,461 +0,0 @@
1
- from typing import Callable, Optional
2
-
3
- import torch
4
- from torchaudio.prototype.functional import barkscale_fbanks, chroma_filterbank
5
- from torchaudio.transforms import Spectrogram
6
- from torchaudio._internal.module_utils import dropping_support, dropping_class_support
7
-
8
- @dropping_class_support
9
- class BarkScale(torch.nn.Module):
10
- r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.
11
-
12
- .. devices:: CPU CUDA
13
-
14
- .. properties:: Autograd TorchScript
15
-
16
- Args:
17
- n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
18
- sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
19
- f_min (float, optional): Minimum frequency. (Default: ``0.``)
20
- f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
21
- n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
22
- norm (str or None, optional): If ``"slaney"``, divide the triangular bark weights by the width of the bark band
23
- (area normalization). (Default: ``None``)
24
- bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
25
-
26
- Example
27
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
28
- >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
29
- >>> spectrogram = spectrogram_transform(waveform)
30
- >>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
31
- >>> barkscale_spectrogram = barkscale_transform(spectrogram)
32
-
33
- See also:
34
- :py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
35
- generate the filter banks.
36
- """
37
- __constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]
38
-
39
- def __init__(
40
- self,
41
- n_barks: int = 128,
42
- sample_rate: int = 16000,
43
- f_min: float = 0.0,
44
- f_max: Optional[float] = None,
45
- n_stft: int = 201,
46
- bark_scale: str = "traunmuller",
47
- ) -> None:
48
- super(BarkScale, self).__init__()
49
- self.n_barks = n_barks
50
- self.sample_rate = sample_rate
51
- self.f_max = f_max if f_max is not None else float(sample_rate // 2)
52
- self.f_min = f_min
53
- self.bark_scale = bark_scale
54
-
55
- if f_min > self.f_max:
56
- raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
57
-
58
- fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale)
59
- self.register_buffer("fb", fb)
60
-
61
- def forward(self, specgram: torch.Tensor) -> torch.Tensor:
62
- r"""
63
- Args:
64
- specgram (torch.Tensor): A spectrogram STFT of dimension (..., freq, time).
65
-
66
- Returns:
67
- torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
68
- """
69
-
70
- # (..., time, freq) dot (freq, n_barks) -> (..., n_barks, time)
71
- bark_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
72
-
73
- return bark_specgram
74
-
75
-
76
- @dropping_class_support
77
- class InverseBarkScale(torch.nn.Module):
78
- r"""Estimate a STFT in normal frequency domain from bark frequency domain.
79
-
80
- .. devices:: CPU CUDA
81
-
82
- It minimizes the Euclidean norm between the input bark-spectrogram and the product between
83
- the estimated spectrogram and the filter banks using SGD.
84
-
85
- Args:
86
- n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
87
- n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
88
- sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
89
- f_min (float, optional): Minimum frequency. (Default: ``0.``)
90
- f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
91
- max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
92
- tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
93
- tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
94
- sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
95
- bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
96
-
97
- Example
98
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
99
- >>> bark_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
100
- >>> bark_spectrogram = bark_spectrogram_transform(waveform)
101
- >>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
102
- >>> spectrogram = inverse_barkscale_transform(bark_spectrogram)
103
- """
104
- __constants__ = [
105
- "n_stft",
106
- "n_barks",
107
- "sample_rate",
108
- "f_min",
109
- "f_max",
110
- "max_iter",
111
- "tolerance_loss",
112
- "tolerance_change",
113
- "sgdargs",
114
- ]
115
-
116
- def __init__(
117
- self,
118
- n_stft: int,
119
- n_barks: int = 128,
120
- sample_rate: int = 16000,
121
- f_min: float = 0.0,
122
- f_max: Optional[float] = None,
123
- max_iter: int = 100000,
124
- tolerance_loss: float = 1e-5,
125
- tolerance_change: float = 1e-8,
126
- sgdargs: Optional[dict] = None,
127
- bark_scale: str = "traunmuller",
128
- ) -> None:
129
- super(InverseBarkScale, self).__init__()
130
- self.n_barks = n_barks
131
- self.sample_rate = sample_rate
132
- self.f_max = f_max or float(sample_rate // 2)
133
- self.f_min = f_min
134
- self.max_iter = max_iter
135
- self.tolerance_loss = tolerance_loss
136
- self.tolerance_change = tolerance_change
137
- self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
138
-
139
- if f_min > self.f_max:
140
- raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
141
-
142
- fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
143
- self.register_buffer("fb", fb)
144
-
145
- def forward(self, barkspec: torch.Tensor) -> torch.Tensor:
146
- r"""
147
- Args:
148
- barkspec (torch.Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)
149
-
150
- Returns:
151
- torch.Tensor: Linear scale spectrogram of size (..., freq, time)
152
- """
153
- # pack batch
154
- shape = barkspec.size()
155
- barkspec = barkspec.view(-1, shape[-2], shape[-1])
156
-
157
- n_barks, time = shape[-2], shape[-1]
158
- freq, _ = self.fb.size() # (freq, n_mels)
159
- barkspec = barkspec.transpose(-1, -2)
160
- if self.n_barks != n_barks:
161
- raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))
162
-
163
- specgram = torch.rand(
164
- barkspec.size()[0], time, freq, requires_grad=True, dtype=barkspec.dtype, device=barkspec.device
165
- )
166
-
167
- optim = torch.optim.SGD([specgram], **self.sgdargs)
168
-
169
- loss = float("inf")
170
- for _ in range(self.max_iter):
171
- optim.zero_grad()
172
- diff = barkspec - specgram.matmul(self.fb)
173
- new_loss = diff.pow(2).sum(axis=-1).mean()
174
- # take sum over bark-frequency then average over other dimensions
175
- # so that loss threshold is applied per unit timeframe
176
- new_loss.backward()
177
- optim.step()
178
- specgram.data = specgram.data.clamp(min=0)
179
-
180
- new_loss = new_loss.item()
181
- if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
182
- break
183
- loss = new_loss
184
-
185
- specgram.requires_grad_(False)
186
- specgram = specgram.clamp(min=0).transpose(-1, -2)
187
-
188
- # unpack batch
189
- specgram = specgram.view(shape[:-2] + (freq, time))
190
- return specgram
191
-
192
-
193
- @dropping_class_support
194
- class BarkSpectrogram(torch.nn.Module):
195
- r"""Create BarkSpectrogram for a raw audio signal.
196
-
197
- .. devices:: CPU CUDA
198
-
199
- .. properties:: Autograd TorchScript
200
-
201
- This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and
202
- :py:func:`torchaudio.prototype.transforms.BarkScale`.
203
-
204
- Sources
205
- * https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
206
- * Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical
207
- Society of America. Vol. 88, Issue 1, 1990, pp. 97–100.
208
- * https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html
209
-
210
- Args:
211
- sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
212
- n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
213
- win_length (int or None, optional): Window size. (Default: ``n_fft``)
214
- hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
215
- f_min (float, optional): Minimum frequency. (Default: ``0.``)
216
- f_max (float or None, optional): Maximum frequency. (Default: ``None``)
217
- pad (int, optional): Two sided padding of signal. (Default: ``0``)
218
- n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
219
- window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
220
- that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
221
- power (float, optional): Exponent for the magnitude spectrogram,
222
- (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
223
- normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
224
- wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
225
- center (bool, optional): whether to pad :attr:`waveform` on both sides so
226
- that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
227
- (Default: ``True``)
228
- pad_mode (string, optional): controls the padding method used when
229
- :attr:`center` is ``True``. (Default: ``"reflect"``)
230
- bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
231
-
232
- Example
233
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
234
- >>> transform = transforms.BarkSpectrogram(sample_rate)
235
- >>> bark_specgram = transform(waveform) # (channel, n_barks, time)
236
-
237
- See also:
238
- :py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
239
- generate the filter banks.
240
- """
241
- __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]
242
-
243
- def __init__(
244
- self,
245
- sample_rate: int = 16000,
246
- n_fft: int = 400,
247
- win_length: Optional[int] = None,
248
- hop_length: Optional[int] = None,
249
- f_min: float = 0.0,
250
- f_max: Optional[float] = None,
251
- pad: int = 0,
252
- n_barks: int = 128,
253
- window_fn: Callable[..., torch.Tensor] = torch.hann_window,
254
- power: float = 2.0,
255
- normalized: bool = False,
256
- wkwargs: Optional[dict] = None,
257
- center: bool = True,
258
- pad_mode: str = "reflect",
259
- bark_scale: str = "traunmuller",
260
- ) -> None:
261
- super(BarkSpectrogram, self).__init__()
262
-
263
- self.sample_rate = sample_rate
264
- self.n_fft = n_fft
265
- self.win_length = win_length if win_length is not None else n_fft
266
- self.hop_length = hop_length if hop_length is not None else self.win_length // 2
267
- self.pad = pad
268
- self.power = power
269
- self.normalized = normalized
270
- self.n_barks = n_barks # number of bark frequency bins
271
- self.f_max = f_max
272
- self.f_min = f_min
273
- self.spectrogram = Spectrogram(
274
- n_fft=self.n_fft,
275
- win_length=self.win_length,
276
- hop_length=self.hop_length,
277
- pad=self.pad,
278
- window_fn=window_fn,
279
- power=self.power,
280
- normalized=self.normalized,
281
- wkwargs=wkwargs,
282
- center=center,
283
- pad_mode=pad_mode,
284
- onesided=True,
285
- )
286
- self.bark_scale = BarkScale(
287
- self.n_barks, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, bark_scale
288
- )
289
-
290
- def forward(self, waveform: torch.Tensor) -> torch.Tensor:
291
- r"""
292
- Args:
293
- waveform (torch.Tensor): torch.Tensor of audio of dimension (..., time).
294
-
295
- Returns:
296
- torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
297
- """
298
- specgram = self.spectrogram(waveform)
299
- bark_specgram = self.bark_scale(specgram)
300
- return bark_specgram
301
-
302
-
303
- @dropping_class_support
304
- class ChromaScale(torch.nn.Module):
305
- r"""Converts spectrogram to chromagram.
306
-
307
- .. devices:: CPU CUDA
308
-
309
- .. properties:: Autograd
310
-
311
- Args:
312
- sample_rate (int): Sample rate of audio signal.
313
- n_freqs (int): Number of frequency bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
314
- n_chroma (int, optional): Number of chroma. (Default: ``12``)
315
- tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
316
- ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
317
- octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
318
- If ``None``, then disable weighting altogether. (Default: 2.0)
319
- norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
320
- base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
321
-
322
- Example
323
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
324
- >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
325
- >>> spectrogram = spectrogram_transform(waveform)
326
- >>> chroma_transform = transforms.ChromaScale(sample_rate=sample_rate, n_freqs=1024 // 2 + 1)
327
- >>> chroma_spectrogram = chroma_transform(spectrogram)
328
-
329
- See also:
330
- :py:func:`torchaudio.prototype.functional.chroma_filterbank` — function used to
331
- generate the filter bank.
332
- """
333
-
334
- def __init__(
335
- self,
336
- sample_rate: int,
337
- n_freqs: int,
338
- *,
339
- n_chroma: int = 12,
340
- tuning: float = 0.0,
341
- ctroct: float = 5.0,
342
- octwidth: Optional[float] = 2.0,
343
- norm: int = 2,
344
- base_c: bool = True,
345
- ):
346
- super().__init__()
347
- fb = chroma_filterbank(
348
- sample_rate, n_freqs, n_chroma, tuning=tuning, ctroct=ctroct, octwidth=octwidth, norm=norm, base_c=base_c
349
- )
350
- self.register_buffer("fb", fb)
351
-
352
- def forward(self, x: torch.Tensor) -> torch.Tensor:
353
- r"""
354
- Args:
355
- x (torch.Tensor): Spectrogram of dimension (..., ``n_freqs``, time).
356
-
357
- Returns:
358
- torch.Tensor: Chroma spectrogram of size (..., ``n_chroma``, time).
359
- """
360
- return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)
361
-
362
-
363
- @dropping_class_support
364
- class ChromaSpectrogram(torch.nn.Module):
365
- r"""Generates chromagram for audio signal.
366
-
367
- .. devices:: CPU CUDA
368
-
369
- .. properties:: Autograd
370
-
371
- Composes :py:func:`torchaudio.transforms.Spectrogram` and
372
- :py:func:`torchaudio.prototype.transforms.ChromaScale`.
373
-
374
- Args:
375
- sample_rate (int): Sample rate of audio signal.
376
- n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins.
377
- win_length (int or None, optional): Window size. (Default: ``n_fft``)
378
- hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
379
- pad (int, optional): Two sided padding of signal. (Default: ``0``)
380
- window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
381
- that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
382
- power (float, optional): Exponent for the magnitude spectrogram,
383
- (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
384
- normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
385
- wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
386
- center (bool, optional): whether to pad :attr:`waveform` on both sides so
387
- that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
388
- (Default: ``True``)
389
- pad_mode (string, optional): controls the padding method used when
390
- :attr:`center` is ``True``. (Default: ``"reflect"``)
391
- n_chroma (int, optional): Number of chroma. (Default: ``12``)
392
- tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
393
- ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
394
- octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
395
- If ``None``, then disable weighting altogether. (Default: 2.0)
396
- norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
397
- base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
398
-
399
- Example
400
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
401
- >>> transform = transforms.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400)
402
- >>> chromagram = transform(waveform) # (channel, n_chroma, time)
403
- """
404
-
405
- def __init__(
406
- self,
407
- sample_rate: int,
408
- n_fft: int,
409
- *,
410
- win_length: Optional[int] = None,
411
- hop_length: Optional[int] = None,
412
- pad: int = 0,
413
- window_fn: Callable[..., torch.Tensor] = torch.hann_window,
414
- power: float = 2.0,
415
- normalized: bool = False,
416
- wkwargs: Optional[dict] = None,
417
- center: bool = True,
418
- pad_mode: str = "reflect",
419
- n_chroma: int = 12,
420
- tuning: float = 0.0,
421
- ctroct: float = 5.0,
422
- octwidth: Optional[float] = 2.0,
423
- norm: int = 2,
424
- base_c: bool = True,
425
- ):
426
- super().__init__()
427
- self.spectrogram = Spectrogram(
428
- n_fft=n_fft,
429
- win_length=win_length,
430
- hop_length=hop_length,
431
- pad=pad,
432
- window_fn=window_fn,
433
- power=power,
434
- normalized=normalized,
435
- wkwargs=wkwargs,
436
- center=center,
437
- pad_mode=pad_mode,
438
- onesided=True,
439
- )
440
- self.chroma_scale = ChromaScale(
441
- sample_rate,
442
- n_fft // 2 + 1,
443
- n_chroma=n_chroma,
444
- tuning=tuning,
445
- base_c=base_c,
446
- ctroct=ctroct,
447
- octwidth=octwidth,
448
- norm=norm,
449
- )
450
-
451
- def forward(self, waveform: torch.Tensor) -> torch.Tensor:
452
- r"""
453
- Args:
454
- waveform (Tensor): Tensor of audio of dimension (..., time).
455
-
456
- Returns:
457
- Tensor: Chromagram of size (..., ``n_chroma``, time).
458
- """
459
- spectrogram = self.spectrogram(waveform)
460
- chroma_spectrogram = self.chroma_scale(spectrogram)
461
- return chroma_spectrogram
@@ -1,10 +0,0 @@
1
- from .sox_effects import apply_effects_file, apply_effects_tensor, effect_names, init_sox_effects, shutdown_sox_effects
2
-
3
-
4
- __all__ = [
5
- "init_sox_effects",
6
- "shutdown_sox_effects",
7
- "effect_names",
8
- "apply_effects_tensor",
9
- "apply_effects_file",
10
- ]