torchaudio 2.7.1__cp311-cp311-win_amd64.whl → 2.9.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. More details are available on the original release page.

Files changed (92)
  1. torchaudio/__init__.py +184 -33
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +68 -10
  5. torchaudio/_torchcodec.py +340 -0
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +6 -3
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +31 -61
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +19 -1
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +3 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +4 -1
  23. torchaudio/transforms/_transforms.py +4 -3
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -1
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/METADATA +15 -7
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -317
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -13
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -144
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -67
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -433
  54. torchaudio/prototype/functional/_rir.py +0 -379
  55. torchaudio/prototype/functional/functional.py +0 -190
  56. torchaudio/prototype/models/__init__.py +0 -36
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -794
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -333
  59. torchaudio/prototype/models/conv_emformer.py +0 -525
  60. torchaudio/prototype/models/hifi_gan.py +0 -336
  61. torchaudio/prototype/models/rnnt.py +0 -711
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -399
  63. torchaudio/prototype/pipelines/__init__.py +0 -12
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -3
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -233
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -82
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -228
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -456
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -272
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -99
  75. torchaudio-2.7.1.dist-info/RECORD +0 -144
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -978
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -247
  91. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,456 +0,0 @@
1
- from typing import Callable, Optional
2
-
3
- import torch
4
- from torchaudio.prototype.functional import barkscale_fbanks, chroma_filterbank
5
- from torchaudio.transforms import Spectrogram
6
-
7
-
8
- class BarkScale(torch.nn.Module):
9
- r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.
10
-
11
- .. devices:: CPU CUDA
12
-
13
- .. properties:: Autograd TorchScript
14
-
15
- Args:
16
- n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
17
- sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
18
- f_min (float, optional): Minimum frequency. (Default: ``0.``)
19
- f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
20
- n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
21
- norm (str or None, optional): If ``"slaney"``, divide the triangular bark weights by the width of the bark band
22
- (area normalization). (Default: ``None``)
23
- bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
24
-
25
- Example
26
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
27
- >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
28
- >>> spectrogram = spectrogram_transform(waveform)
29
- >>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
30
- >>> barkscale_spectrogram = barkscale_transform(spectrogram)
31
-
32
- See also:
33
- :py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
34
- generate the filter banks.
35
- """
36
- __constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]
37
-
38
- def __init__(
39
- self,
40
- n_barks: int = 128,
41
- sample_rate: int = 16000,
42
- f_min: float = 0.0,
43
- f_max: Optional[float] = None,
44
- n_stft: int = 201,
45
- bark_scale: str = "traunmuller",
46
- ) -> None:
47
- super(BarkScale, self).__init__()
48
- self.n_barks = n_barks
49
- self.sample_rate = sample_rate
50
- self.f_max = f_max if f_max is not None else float(sample_rate // 2)
51
- self.f_min = f_min
52
- self.bark_scale = bark_scale
53
-
54
- if f_min > self.f_max:
55
- raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
56
-
57
- fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale)
58
- self.register_buffer("fb", fb)
59
-
60
- def forward(self, specgram: torch.Tensor) -> torch.Tensor:
61
- r"""
62
- Args:
63
- specgram (torch.Tensor): A spectrogram STFT of dimension (..., freq, time).
64
-
65
- Returns:
66
- torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
67
- """
68
-
69
- # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time)
70
- bark_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
71
-
72
- return bark_specgram
73
-
74
-
75
- class InverseBarkScale(torch.nn.Module):
76
- r"""Estimate a STFT in normal frequency domain from bark frequency domain.
77
-
78
- .. devices:: CPU CUDA
79
-
80
- It minimizes the euclidian norm between the input bark-spectrogram and the product between
81
- the estimated spectrogram and the filter banks using SGD.
82
-
83
- Args:
84
- n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
85
- n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
86
- sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
87
- f_min (float, optional): Minimum frequency. (Default: ``0.``)
88
- f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
89
- max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
90
- tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
91
- tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
92
- sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
93
- bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
94
-
95
- Example
96
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
97
- >>> mel_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
98
- >>> mel_spectrogram = bark_spectrogram_transform(waveform)
99
- >>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
100
- >>> spectrogram = inverse_barkscale_transform(mel_spectrogram)
101
- """
102
- __constants__ = [
103
- "n_stft",
104
- "n_barks",
105
- "sample_rate",
106
- "f_min",
107
- "f_max",
108
- "max_iter",
109
- "tolerance_loss",
110
- "tolerance_change",
111
- "sgdargs",
112
- ]
113
-
114
- def __init__(
115
- self,
116
- n_stft: int,
117
- n_barks: int = 128,
118
- sample_rate: int = 16000,
119
- f_min: float = 0.0,
120
- f_max: Optional[float] = None,
121
- max_iter: int = 100000,
122
- tolerance_loss: float = 1e-5,
123
- tolerance_change: float = 1e-8,
124
- sgdargs: Optional[dict] = None,
125
- bark_scale: str = "traunmuller",
126
- ) -> None:
127
- super(InverseBarkScale, self).__init__()
128
- self.n_barks = n_barks
129
- self.sample_rate = sample_rate
130
- self.f_max = f_max or float(sample_rate // 2)
131
- self.f_min = f_min
132
- self.max_iter = max_iter
133
- self.tolerance_loss = tolerance_loss
134
- self.tolerance_change = tolerance_change
135
- self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
136
-
137
- if f_min > self.f_max:
138
- raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
139
-
140
- fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
141
- self.register_buffer("fb", fb)
142
-
143
- def forward(self, barkspec: torch.Tensor) -> torch.Tensor:
144
- r"""
145
- Args:
146
- barkspec (torch.Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)
147
-
148
- Returns:
149
- torch.Tensor: Linear scale spectrogram of size (..., freq, time)
150
- """
151
- # pack batch
152
- shape = barkspec.size()
153
- barkspec = barkspec.view(-1, shape[-2], shape[-1])
154
-
155
- n_barks, time = shape[-2], shape[-1]
156
- freq, _ = self.fb.size() # (freq, n_mels)
157
- barkspec = barkspec.transpose(-1, -2)
158
- if self.n_barks != n_barks:
159
- raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))
160
-
161
- specgram = torch.rand(
162
- barkspec.size()[0], time, freq, requires_grad=True, dtype=barkspec.dtype, device=barkspec.device
163
- )
164
-
165
- optim = torch.optim.SGD([specgram], **self.sgdargs)
166
-
167
- loss = float("inf")
168
- for _ in range(self.max_iter):
169
- optim.zero_grad()
170
- diff = barkspec - specgram.matmul(self.fb)
171
- new_loss = diff.pow(2).sum(axis=-1).mean()
172
- # take sum over bark-frequency then average over other dimensions
173
- # so that loss threshold is applied par unit timeframe
174
- new_loss.backward()
175
- optim.step()
176
- specgram.data = specgram.data.clamp(min=0)
177
-
178
- new_loss = new_loss.item()
179
- if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
180
- break
181
- loss = new_loss
182
-
183
- specgram.requires_grad_(False)
184
- specgram = specgram.clamp(min=0).transpose(-1, -2)
185
-
186
- # unpack batch
187
- specgram = specgram.view(shape[:-2] + (freq, time))
188
- return specgram
189
-
190
-
191
- class BarkSpectrogram(torch.nn.Module):
192
- r"""Create BarkSpectrogram for a raw audio signal.
193
-
194
- .. devices:: CPU CUDA
195
-
196
- .. properties:: Autograd TorchScript
197
-
198
- This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and
199
- and :py:func:`torchaudio.transforms.BarkScale`.
200
-
201
- Sources
202
- * https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
203
- * Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical
204
- * Society of America. Vol. 88, Issue 1, 1990, pp. 97–100.
205
- * https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html
206
-
207
- Args:
208
- sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
209
- n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
210
- win_length (int or None, optional): Window size. (Default: ``n_fft``)
211
- hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
212
- f_min (float, optional): Minimum frequency. (Default: ``0.``)
213
- f_max (float or None, optional): Maximum frequency. (Default: ``None``)
214
- pad (int, optional): Two sided padding of signal. (Default: ``0``)
215
- n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
216
- window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
217
- that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
218
- power (float, optional): Exponent for the magnitude spectrogram,
219
- (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
220
- normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
221
- wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
222
- center (bool, optional): whether to pad :attr:`waveform` on both sides so
223
- that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
224
- (Default: ``True``)
225
- pad_mode (string, optional): controls the padding method used when
226
- :attr:`center` is ``True``. (Default: ``"reflect"``)
227
- bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
228
-
229
- Example
230
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
231
- >>> transform = transforms.BarkSpectrogram(sample_rate)
232
- >>> bark_specgram = transform(waveform) # (channel, n_barks, time)
233
-
234
- See also:
235
- :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
236
- generate the filter banks.
237
- """
238
- __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]
239
-
240
- def __init__(
241
- self,
242
- sample_rate: int = 16000,
243
- n_fft: int = 400,
244
- win_length: Optional[int] = None,
245
- hop_length: Optional[int] = None,
246
- f_min: float = 0.0,
247
- f_max: Optional[float] = None,
248
- pad: int = 0,
249
- n_barks: int = 128,
250
- window_fn: Callable[..., torch.Tensor] = torch.hann_window,
251
- power: float = 2.0,
252
- normalized: bool = False,
253
- wkwargs: Optional[dict] = None,
254
- center: bool = True,
255
- pad_mode: str = "reflect",
256
- bark_scale: str = "traunmuller",
257
- ) -> None:
258
- super(BarkSpectrogram, self).__init__()
259
-
260
- self.sample_rate = sample_rate
261
- self.n_fft = n_fft
262
- self.win_length = win_length if win_length is not None else n_fft
263
- self.hop_length = hop_length if hop_length is not None else self.win_length // 2
264
- self.pad = pad
265
- self.power = power
266
- self.normalized = normalized
267
- self.n_barks = n_barks # number of bark frequency bins
268
- self.f_max = f_max
269
- self.f_min = f_min
270
- self.spectrogram = Spectrogram(
271
- n_fft=self.n_fft,
272
- win_length=self.win_length,
273
- hop_length=self.hop_length,
274
- pad=self.pad,
275
- window_fn=window_fn,
276
- power=self.power,
277
- normalized=self.normalized,
278
- wkwargs=wkwargs,
279
- center=center,
280
- pad_mode=pad_mode,
281
- onesided=True,
282
- )
283
- self.bark_scale = BarkScale(
284
- self.n_barks, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, bark_scale
285
- )
286
-
287
- def forward(self, waveform: torch.Tensor) -> torch.Tensor:
288
- r"""
289
- Args:
290
- waveform (torch.Tensor): torch.Tensor of audio of dimension (..., time).
291
-
292
- Returns:
293
- torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
294
- """
295
- specgram = self.spectrogram(waveform)
296
- bark_specgram = self.bark_scale(specgram)
297
- return bark_specgram
298
-
299
-
300
- class ChromaScale(torch.nn.Module):
301
- r"""Converts spectrogram to chromagram.
302
-
303
- .. devices:: CPU CUDA
304
-
305
- .. properties:: Autograd
306
-
307
- Args:
308
- sample_rate (int): Sample rate of audio signal.
309
- n_freqs (int): Number of frequency bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
310
- n_chroma (int, optional): Number of chroma. (Default: ``12``)
311
- tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
312
- ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
313
- octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
314
- If ``None``, then disable weighting altogether. (Default: 2.0)
315
- norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
316
- base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
317
-
318
- Example
319
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
320
- >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
321
- >>> spectrogram = spectrogram_transform(waveform)
322
- >>> chroma_transform = transforms.ChromaScale(sample_rate=sample_rate, n_freqs=1024 // 2 + 1)
323
- >>> chroma_spectrogram = chroma_transform(spectrogram)
324
-
325
- See also:
326
- :py:func:`torchaudio.prototype.functional.chroma_filterbank` — function used to
327
- generate the filter bank.
328
- """
329
-
330
- def __init__(
331
- self,
332
- sample_rate: int,
333
- n_freqs: int,
334
- *,
335
- n_chroma: int = 12,
336
- tuning: float = 0.0,
337
- ctroct: float = 5.0,
338
- octwidth: Optional[float] = 2.0,
339
- norm: int = 2,
340
- base_c: bool = True,
341
- ):
342
- super().__init__()
343
- fb = chroma_filterbank(
344
- sample_rate, n_freqs, n_chroma, tuning=tuning, ctroct=ctroct, octwidth=octwidth, norm=norm, base_c=base_c
345
- )
346
- self.register_buffer("fb", fb)
347
-
348
- def forward(self, x: torch.Tensor) -> torch.Tensor:
349
- r"""
350
- Args:
351
- specgram (torch.Tensor): Spectrogram of dimension (..., ``n_freqs``, time).
352
-
353
- Returns:
354
- torch.Tensor: Chroma spectrogram of size (..., ``n_chroma``, time).
355
- """
356
- return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)
357
-
358
-
359
- class ChromaSpectrogram(torch.nn.Module):
360
- r"""Generates chromagram for audio signal.
361
-
362
- .. devices:: CPU CUDA
363
-
364
- .. properties:: Autograd
365
-
366
- Composes :py:func:`torchaudio.transforms.Spectrogram` and
367
- and :py:func:`torchaudio.prototype.transforms.ChromaScale`.
368
-
369
- Args:
370
- sample_rate (int): Sample rate of audio signal.
371
- n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins.
372
- win_length (int or None, optional): Window size. (Default: ``n_fft``)
373
- hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
374
- pad (int, optional): Two sided padding of signal. (Default: ``0``)
375
- window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
376
- that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
377
- power (float, optional): Exponent for the magnitude spectrogram,
378
- (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
379
- normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
380
- wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
381
- center (bool, optional): whether to pad :attr:`waveform` on both sides so
382
- that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
383
- (Default: ``True``)
384
- pad_mode (string, optional): controls the padding method used when
385
- :attr:`center` is ``True``. (Default: ``"reflect"``)
386
- n_chroma (int, optional): Number of chroma. (Default: ``12``)
387
- tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
388
- ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
389
- octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
390
- If ``None``, then disable weighting altogether. (Default: 2.0)
391
- norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
392
- base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
393
-
394
- Example
395
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
396
- >>> transform = transforms.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400)
397
- >>> chromagram = transform(waveform) # (channel, n_chroma, time)
398
- """
399
-
400
- def __init__(
401
- self,
402
- sample_rate: int,
403
- n_fft: int,
404
- *,
405
- win_length: Optional[int] = None,
406
- hop_length: Optional[int] = None,
407
- pad: int = 0,
408
- window_fn: Callable[..., torch.Tensor] = torch.hann_window,
409
- power: float = 2.0,
410
- normalized: bool = False,
411
- wkwargs: Optional[dict] = None,
412
- center: bool = True,
413
- pad_mode: str = "reflect",
414
- n_chroma: int = 12,
415
- tuning: float = 0.0,
416
- ctroct: float = 5.0,
417
- octwidth: Optional[float] = 2.0,
418
- norm: int = 2,
419
- base_c: bool = True,
420
- ):
421
- super().__init__()
422
- self.spectrogram = Spectrogram(
423
- n_fft=n_fft,
424
- win_length=win_length,
425
- hop_length=hop_length,
426
- pad=pad,
427
- window_fn=window_fn,
428
- power=power,
429
- normalized=normalized,
430
- wkwargs=wkwargs,
431
- center=center,
432
- pad_mode=pad_mode,
433
- onesided=True,
434
- )
435
- self.chroma_scale = ChromaScale(
436
- sample_rate,
437
- n_fft // 2 + 1,
438
- n_chroma=n_chroma,
439
- tuning=tuning,
440
- base_c=base_c,
441
- ctroct=ctroct,
442
- octwidth=octwidth,
443
- norm=norm,
444
- )
445
-
446
- def forward(self, waveform: torch.Tensor) -> torch.Tensor:
447
- r"""
448
- Args:
449
- waveform (Tensor): Tensor of audio of dimension (..., time).
450
-
451
- Returns:
452
- Tensor: Chromagram of size (..., ``n_chroma``, time).
453
- """
454
- spectrogram = self.spectrogram(waveform)
455
- chroma_spectrogram = self.chroma_scale(spectrogram)
456
- return chroma_spectrogram
@@ -1,10 +0,0 @@
1
- from .sox_effects import apply_effects_file, apply_effects_tensor, effect_names, init_sox_effects, shutdown_sox_effects
2
-
3
-
4
- __all__ = [
5
- "init_sox_effects",
6
- "shutdown_sox_effects",
7
- "effect_names",
8
- "apply_effects_tensor",
9
- "apply_effects_file",
10
- ]