torchaudio-2.8.0-cp312-cp312-win_amd64.whl → torchaudio-2.9.0-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of torchaudio has been flagged as potentially problematic.
- torchaudio/__init__.py +179 -39
- torchaudio/_extension/__init__.py +1 -14
- torchaudio/_extension/utils.py +0 -47
- torchaudio/_internal/module_utils.py +12 -3
- torchaudio/_torchcodec.py +73 -85
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/utils.py +1 -1
- torchaudio/functional/__init__.py +0 -2
- torchaudio/functional/_alignment.py +1 -1
- torchaudio/functional/filtering.py +70 -55
- torchaudio/functional/functional.py +26 -60
- torchaudio/lib/_torchaudio.pyd +0 -0
- torchaudio/lib/libtorchaudio.pyd +0 -0
- torchaudio/models/decoder/__init__.py +14 -2
- torchaudio/models/decoder/_ctc_decoder.py +6 -6
- torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
- torchaudio/models/squim/objective.py +2 -2
- torchaudio/pipelines/_source_separation_pipeline.py +1 -1
- torchaudio/pipelines/_squim_pipeline.py +2 -2
- torchaudio/pipelines/_tts/utils.py +1 -1
- torchaudio/pipelines/rnnt_pipeline.py +4 -4
- torchaudio/transforms/__init__.py +1 -0
- torchaudio/transforms/_transforms.py +2 -2
- torchaudio/utils/__init__.py +2 -9
- torchaudio/utils/download.py +1 -3
- torchaudio/version.py +2 -2
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
- torchaudio-2.9.0.dist-info/RECORD +85 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
- torchaudio/_backend/__init__.py +0 -61
- torchaudio/_backend/backend.py +0 -53
- torchaudio/_backend/common.py +0 -52
- torchaudio/_backend/ffmpeg.py +0 -334
- torchaudio/_backend/soundfile.py +0 -54
- torchaudio/_backend/soundfile_backend.py +0 -457
- torchaudio/_backend/sox.py +0 -91
- torchaudio/_backend/utils.py +0 -350
- torchaudio/backend/__init__.py +0 -8
- torchaudio/backend/_no_backend.py +0 -25
- torchaudio/backend/_sox_io_backend.py +0 -294
- torchaudio/backend/common.py +0 -13
- torchaudio/backend/no_backend.py +0 -14
- torchaudio/backend/soundfile_backend.py +0 -14
- torchaudio/backend/sox_io_backend.py +0 -14
- torchaudio/io/__init__.py +0 -20
- torchaudio/io/_effector.py +0 -347
- torchaudio/io/_playback.py +0 -72
- torchaudio/kaldi_io.py +0 -150
- torchaudio/prototype/__init__.py +0 -0
- torchaudio/prototype/datasets/__init__.py +0 -4
- torchaudio/prototype/datasets/musan.py +0 -68
- torchaudio/prototype/functional/__init__.py +0 -26
- torchaudio/prototype/functional/_dsp.py +0 -441
- torchaudio/prototype/functional/_rir.py +0 -382
- torchaudio/prototype/functional/functional.py +0 -193
- torchaudio/prototype/models/__init__.py +0 -39
- torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
- torchaudio/prototype/models/_emformer_hubert.py +0 -337
- torchaudio/prototype/models/conv_emformer.py +0 -529
- torchaudio/prototype/models/hifi_gan.py +0 -342
- torchaudio/prototype/models/rnnt.py +0 -717
- torchaudio/prototype/models/rnnt_decoder.py +0 -402
- torchaudio/prototype/pipelines/__init__.py +0 -21
- torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
- torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
- torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
- torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
- torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
- torchaudio/prototype/transforms/__init__.py +0 -9
- torchaudio/prototype/transforms/_transforms.py +0 -461
- torchaudio/sox_effects/__init__.py +0 -10
- torchaudio/sox_effects/sox_effects.py +0 -275
- torchaudio/utils/ffmpeg_utils.py +0 -11
- torchaudio/utils/sox_utils.py +0 -118
- torchaudio-2.8.0.dist-info/RECORD +0 -145
- torio/__init__.py +0 -8
- torio/_extension/__init__.py +0 -13
- torio/_extension/utils.py +0 -147
- torio/io/__init__.py +0 -9
- torio/io/_streaming_media_decoder.py +0 -977
- torio/io/_streaming_media_encoder.py +0 -502
- torio/lib/__init__.py +0 -0
- torio/lib/_torio_ffmpeg4.pyd +0 -0
- torio/lib/_torio_ffmpeg5.pyd +0 -0
- torio/lib/_torio_ffmpeg6.pyd +0 -0
- torio/lib/libtorio_ffmpeg4.pyd +0 -0
- torio/lib/libtorio_ffmpeg5.pyd +0 -0
- torio/lib/libtorio_ffmpeg6.pyd +0 -0
- torio/utils/__init__.py +0 -4
- torio/utils/ffmpeg_utils.py +0 -275
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
- {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
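Most of the `+0 -N` rows above correspond to entire subpackages being dropped: torchaudio.prototype, the legacy torchaudio.backend and torchaudio._backend I/O layers, torchaudio.sox_effects, torchaudio.io, torchaudio.kaldi_io, and the bundled torio package. A minimal sketch (assuming torchaudio is installed) for checking whether an environment still resolves those imports; the module names are inferred from the deleted paths above, not from any release notes:

import importlib.util

# Top-level modules whose files appear only with negative line counts in this
# diff (names inferred from the deleted paths above); adjust to your own usage.
removed_modules = [
    "torchaudio.prototype",
    "torchaudio.backend",
    "torchaudio.sox_effects",
    "torchaudio.io",
    "torchaudio.kaldi_io",
    "torio",
]

for name in removed_modules:
    # find_spec returns None when the (sub)module can no longer be resolved.
    present = importlib.util.find_spec(name) is not None
    print(f"{name}: {'still importable' if present else 'not found'}")
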
torchaudio/prototype/transforms/_transforms.py
@@ -1,461 +0,0 @@
-from typing import Callable, Optional
-
-import torch
-from torchaudio.prototype.functional import barkscale_fbanks, chroma_filterbank
-from torchaudio.transforms import Spectrogram
-from torchaudio._internal.module_utils import dropping_support, dropping_class_support
-
-@dropping_class_support
-class BarkScale(torch.nn.Module):
-    r"""Turn a normal STFT into a bark frequency STFT with triangular filter banks.
-
-    .. devices:: CPU CUDA
-
-    .. properties:: Autograd TorchScript
-
-    Args:
-        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
-        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
-        f_min (float, optional): Minimum frequency. (Default: ``0.``)
-        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
-        n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
-        norm (str or None, optional): If ``"slaney"``, divide the triangular bark weights by the width of the bark band
-            (area normalization). (Default: ``None``)
-        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Example
-        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
-        >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
-        >>> spectrogram = spectrogram_transform(waveform)
-        >>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
-        >>> barkscale_spectrogram = barkscale_transform(spectrogram)
-
-    See also:
-        :py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
-        generate the filter banks.
-    """
-    __constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]
-
-    def __init__(
-        self,
-        n_barks: int = 128,
-        sample_rate: int = 16000,
-        f_min: float = 0.0,
-        f_max: Optional[float] = None,
-        n_stft: int = 201,
-        bark_scale: str = "traunmuller",
-    ) -> None:
-        super(BarkScale, self).__init__()
-        self.n_barks = n_barks
-        self.sample_rate = sample_rate
-        self.f_max = f_max if f_max is not None else float(sample_rate // 2)
-        self.f_min = f_min
-        self.bark_scale = bark_scale
-
-        if f_min > self.f_max:
-            raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
-
-        fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale)
-        self.register_buffer("fb", fb)
-
-    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
-        r"""
-        Args:
-            specgram (torch.Tensor): A spectrogram STFT of dimension (..., freq, time).
-
-        Returns:
-            torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
-        """
-
-        # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time)
-        bark_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
-
-        return bark_specgram
-
-
-@dropping_class_support
-class InverseBarkScale(torch.nn.Module):
-    r"""Estimate a STFT in normal frequency domain from bark frequency domain.
-
-    .. devices:: CPU CUDA
-
-    It minimizes the euclidian norm between the input bark-spectrogram and the product between
-    the estimated spectrogram and the filter banks using SGD.
-
-    Args:
-        n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
-        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
-        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
-        f_min (float, optional): Minimum frequency. (Default: ``0.``)
-        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
-        max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
-        tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
-        tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
-        sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``)
-        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Example
-        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
-        >>> mel_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
-        >>> mel_spectrogram = bark_spectrogram_transform(waveform)
-        >>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
-        >>> spectrogram = inverse_barkscale_transform(mel_spectrogram)
-    """
-    __constants__ = [
-        "n_stft",
-        "n_barks",
-        "sample_rate",
-        "f_min",
-        "f_max",
-        "max_iter",
-        "tolerance_loss",
-        "tolerance_change",
-        "sgdargs",
-    ]
-
-    def __init__(
-        self,
-        n_stft: int,
-        n_barks: int = 128,
-        sample_rate: int = 16000,
-        f_min: float = 0.0,
-        f_max: Optional[float] = None,
-        max_iter: int = 100000,
-        tolerance_loss: float = 1e-5,
-        tolerance_change: float = 1e-8,
-        sgdargs: Optional[dict] = None,
-        bark_scale: str = "traunmuller",
-    ) -> None:
-        super(InverseBarkScale, self).__init__()
-        self.n_barks = n_barks
-        self.sample_rate = sample_rate
-        self.f_max = f_max or float(sample_rate // 2)
-        self.f_min = f_min
-        self.max_iter = max_iter
-        self.tolerance_loss = tolerance_loss
-        self.tolerance_change = tolerance_change
-        self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
-
-        if f_min > self.f_max:
-            raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))
-
-        fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
-        self.register_buffer("fb", fb)
-
-    def forward(self, barkspec: torch.Tensor) -> torch.Tensor:
-        r"""
-        Args:
-            barkspec (torch.Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)
-
-        Returns:
-            torch.Tensor: Linear scale spectrogram of size (..., freq, time)
-        """
-        # pack batch
-        shape = barkspec.size()
-        barkspec = barkspec.view(-1, shape[-2], shape[-1])
-
-        n_barks, time = shape[-2], shape[-1]
-        freq, _ = self.fb.size()  # (freq, n_mels)
-        barkspec = barkspec.transpose(-1, -2)
-        if self.n_barks != n_barks:
-            raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))
-
-        specgram = torch.rand(
-            barkspec.size()[0], time, freq, requires_grad=True, dtype=barkspec.dtype, device=barkspec.device
-        )
-
-        optim = torch.optim.SGD([specgram], **self.sgdargs)
-
-        loss = float("inf")
-        for _ in range(self.max_iter):
-            optim.zero_grad()
-            diff = barkspec - specgram.matmul(self.fb)
-            new_loss = diff.pow(2).sum(axis=-1).mean()
-            # take sum over bark-frequency then average over other dimensions
-            # so that loss threshold is applied par unit timeframe
-            new_loss.backward()
-            optim.step()
-            specgram.data = specgram.data.clamp(min=0)
-
-            new_loss = new_loss.item()
-            if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
-                break
-            loss = new_loss
-
-        specgram.requires_grad_(False)
-        specgram = specgram.clamp(min=0).transpose(-1, -2)
-
-        # unpack batch
-        specgram = specgram.view(shape[:-2] + (freq, time))
-        return specgram
-
-
-@dropping_class_support
-class BarkSpectrogram(torch.nn.Module):
-    r"""Create BarkSpectrogram for a raw audio signal.
-
-    .. devices:: CPU CUDA
-
-    .. properties:: Autograd TorchScript
-
-    This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and
-    and :py:func:`torchaudio.transforms.BarkScale`.
-
-    Sources
-        * https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
-        * Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical
-        * Society of America. Vol. 88, Issue 1, 1990, pp. 97–100.
-        * https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html
-
-    Args:
-        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
-        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
-        win_length (int or None, optional): Window size. (Default: ``n_fft``)
-        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
-        f_min (float, optional): Minimum frequency. (Default: ``0.``)
-        f_max (float or None, optional): Maximum frequency. (Default: ``None``)
-        pad (int, optional): Two sided padding of signal. (Default: ``0``)
-        n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
-        window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
-            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
-        power (float, optional): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
-        normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
-        wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
-        center (bool, optional): whether to pad :attr:`waveform` on both sides so
-            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
-            (Default: ``True``)
-        pad_mode (string, optional): controls the padding method used when
-            :attr:`center` is ``True``. (Default: ``"reflect"``)
-        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Example
-        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
-        >>> transform = transforms.BarkSpectrogram(sample_rate)
-        >>> bark_specgram = transform(waveform)  # (channel, n_barks, time)
-
-    See also:
-        :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
-        generate the filter banks.
-    """
-    __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]
-
-    def __init__(
-        self,
-        sample_rate: int = 16000,
-        n_fft: int = 400,
-        win_length: Optional[int] = None,
-        hop_length: Optional[int] = None,
-        f_min: float = 0.0,
-        f_max: Optional[float] = None,
-        pad: int = 0,
-        n_barks: int = 128,
-        window_fn: Callable[..., torch.Tensor] = torch.hann_window,
-        power: float = 2.0,
-        normalized: bool = False,
-        wkwargs: Optional[dict] = None,
-        center: bool = True,
-        pad_mode: str = "reflect",
-        bark_scale: str = "traunmuller",
-    ) -> None:
-        super(BarkSpectrogram, self).__init__()
-
-        self.sample_rate = sample_rate
-        self.n_fft = n_fft
-        self.win_length = win_length if win_length is not None else n_fft
-        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
-        self.pad = pad
-        self.power = power
-        self.normalized = normalized
-        self.n_barks = n_barks  # number of bark frequency bins
-        self.f_max = f_max
-        self.f_min = f_min
-        self.spectrogram = Spectrogram(
-            n_fft=self.n_fft,
-            win_length=self.win_length,
-            hop_length=self.hop_length,
-            pad=self.pad,
-            window_fn=window_fn,
-            power=self.power,
-            normalized=self.normalized,
-            wkwargs=wkwargs,
-            center=center,
-            pad_mode=pad_mode,
-            onesided=True,
-        )
-        self.bark_scale = BarkScale(
-            self.n_barks, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, bark_scale
-        )
-
-    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
-        r"""
-        Args:
-            waveform (torch.Tensor): torch.Tensor of audio of dimension (..., time).
-
-        Returns:
-            torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
-        """
-        specgram = self.spectrogram(waveform)
-        bark_specgram = self.bark_scale(specgram)
-        return bark_specgram
-
-
-@dropping_class_support
-class ChromaScale(torch.nn.Module):
-    r"""Converts spectrogram to chromagram.
-
-    .. devices:: CPU CUDA
-
-    .. properties:: Autograd
-
-    Args:
-        sample_rate (int): Sample rate of audio signal.
-        n_freqs (int): Number of frequency bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
-        n_chroma (int, optional): Number of chroma. (Default: ``12``)
-        tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
-        ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
-        octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
-            If ``None``, then disable weighting altogether. (Default: 2.0)
-        norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
-        base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
-
-    Example
-        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
-        >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
-        >>> spectrogram = spectrogram_transform(waveform)
-        >>> chroma_transform = transforms.ChromaScale(sample_rate=sample_rate, n_freqs=1024 // 2 + 1)
-        >>> chroma_spectrogram = chroma_transform(spectrogram)
-
-    See also:
-        :py:func:`torchaudio.prototype.functional.chroma_filterbank` — function used to
-        generate the filter bank.
-    """
-
-    def __init__(
-        self,
-        sample_rate: int,
-        n_freqs: int,
-        *,
-        n_chroma: int = 12,
-        tuning: float = 0.0,
-        ctroct: float = 5.0,
-        octwidth: Optional[float] = 2.0,
-        norm: int = 2,
-        base_c: bool = True,
-    ):
-        super().__init__()
-        fb = chroma_filterbank(
-            sample_rate, n_freqs, n_chroma, tuning=tuning, ctroct=ctroct, octwidth=octwidth, norm=norm, base_c=base_c
-        )
-        self.register_buffer("fb", fb)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        r"""
-        Args:
-            specgram (torch.Tensor): Spectrogram of dimension (..., ``n_freqs``, time).
-
-        Returns:
-            torch.Tensor: Chroma spectrogram of size (..., ``n_chroma``, time).
-        """
-        return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)
-
-
-@dropping_class_support
-class ChromaSpectrogram(torch.nn.Module):
-    r"""Generates chromagram for audio signal.
-
-    .. devices:: CPU CUDA
-
-    .. properties:: Autograd
-
-    Composes :py:func:`torchaudio.transforms.Spectrogram` and
-    and :py:func:`torchaudio.prototype.transforms.ChromaScale`.
-
-    Args:
-        sample_rate (int): Sample rate of audio signal.
-        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins.
-        win_length (int or None, optional): Window size. (Default: ``n_fft``)
-        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
-        pad (int, optional): Two sided padding of signal. (Default: ``0``)
-        window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
-            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
-        power (float, optional): Exponent for the magnitude spectrogram,
-            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
-        normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
-        wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
-        center (bool, optional): whether to pad :attr:`waveform` on both sides so
-            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
-            (Default: ``True``)
-        pad_mode (string, optional): controls the padding method used when
-            :attr:`center` is ``True``. (Default: ``"reflect"``)
-        n_chroma (int, optional): Number of chroma. (Default: ``12``)
-        tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
-        ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
-        octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
-            If ``None``, then disable weighting altogether. (Default: 2.0)
-        norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
-        base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
-
-    Example
-        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
-        >>> transform = transforms.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400)
-        >>> chromagram = transform(waveform)  # (channel, n_chroma, time)
-    """
-
-    def __init__(
-        self,
-        sample_rate: int,
-        n_fft: int,
-        *,
-        win_length: Optional[int] = None,
-        hop_length: Optional[int] = None,
-        pad: int = 0,
-        window_fn: Callable[..., torch.Tensor] = torch.hann_window,
-        power: float = 2.0,
-        normalized: bool = False,
-        wkwargs: Optional[dict] = None,
-        center: bool = True,
-        pad_mode: str = "reflect",
-        n_chroma: int = 12,
-        tuning: float = 0.0,
-        ctroct: float = 5.0,
-        octwidth: Optional[float] = 2.0,
-        norm: int = 2,
-        base_c: bool = True,
-    ):
-        super().__init__()
-        self.spectrogram = Spectrogram(
-            n_fft=n_fft,
-            win_length=win_length,
-            hop_length=hop_length,
-            pad=pad,
-            window_fn=window_fn,
-            power=power,
-            normalized=normalized,
-            wkwargs=wkwargs,
-            center=center,
-            pad_mode=pad_mode,
-            onesided=True,
-        )
-        self.chroma_scale = ChromaScale(
-            sample_rate,
-            n_fft // 2 + 1,
-            n_chroma=n_chroma,
-            tuning=tuning,
-            base_c=base_c,
-            ctroct=ctroct,
-            octwidth=octwidth,
-            norm=norm,
-        )
-
-    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
-        r"""
-        Args:
-            waveform (Tensor): Tensor of audio of dimension (..., time).
-
-        Returns:
-            Tensor: Chromagram of size (..., ``n_chroma``, time).
-        """
-        spectrogram = self.spectrogram(waveform)
-        chroma_spectrogram = self.chroma_scale(spectrogram)
-        return chroma_spectrogram