torchaudio: 2.7.1-cp311-cp311-win_amd64.whl → 2.9.0-cp311-cp311-win_amd64.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of torchaudio has been flagged as potentially problematic; see the registry page for details.
- torchaudio/__init__.py +184 -33
- torchaudio/_extension/__init__.py +1 -14
- torchaudio/_extension/utils.py +0 -47
- torchaudio/_internal/module_utils.py +68 -10
- torchaudio/_torchcodec.py +340 -0
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/utils.py +1 -1
- torchaudio/functional/__init__.py +6 -3
- torchaudio/functional/_alignment.py +1 -1
- torchaudio/functional/filtering.py +70 -55
- torchaudio/functional/functional.py +31 -61
- torchaudio/lib/_torchaudio.pyd +0 -0
- torchaudio/lib/libtorchaudio.pyd +0 -0
- torchaudio/models/decoder/__init__.py +19 -1
- torchaudio/models/decoder/_ctc_decoder.py +6 -6
- torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
- torchaudio/models/squim/objective.py +2 -2
- torchaudio/pipelines/_source_separation_pipeline.py +1 -1
- torchaudio/pipelines/_squim_pipeline.py +2 -2
- torchaudio/pipelines/_tts/utils.py +3 -1
- torchaudio/pipelines/rnnt_pipeline.py +4 -4
- torchaudio/transforms/__init__.py +4 -1
- torchaudio/transforms/_transforms.py +4 -3
- torchaudio/utils/__init__.py +2 -9
- torchaudio/utils/download.py +1 -1
- torchaudio/version.py +2 -2
- {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/METADATA +15 -7
- torchaudio-2.9.0.dist-info/RECORD +85 -0
- {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
- torchaudio/_backend/__init__.py +0 -61
- torchaudio/_backend/backend.py +0 -53
- torchaudio/_backend/common.py +0 -52
- torchaudio/_backend/ffmpeg.py +0 -334
- torchaudio/_backend/soundfile.py +0 -54
- torchaudio/_backend/soundfile_backend.py +0 -457
- torchaudio/_backend/sox.py +0 -91
- torchaudio/_backend/utils.py +0 -317
- torchaudio/backend/__init__.py +0 -8
- torchaudio/backend/_no_backend.py +0 -25
- torchaudio/backend/_sox_io_backend.py +0 -294
- torchaudio/backend/common.py +0 -13
- torchaudio/backend/no_backend.py +0 -14
- torchaudio/backend/soundfile_backend.py +0 -14
- torchaudio/backend/sox_io_backend.py +0 -14
- torchaudio/io/__init__.py +0 -13
- torchaudio/io/_effector.py +0 -347
- torchaudio/io/_playback.py +0 -72
- torchaudio/kaldi_io.py +0 -144
- torchaudio/prototype/__init__.py +0 -0
- torchaudio/prototype/datasets/__init__.py +0 -4
- torchaudio/prototype/datasets/musan.py +0 -67
- torchaudio/prototype/functional/__init__.py +0 -26
- torchaudio/prototype/functional/_dsp.py +0 -433
- torchaudio/prototype/functional/_rir.py +0 -379
- torchaudio/prototype/functional/functional.py +0 -190
- torchaudio/prototype/models/__init__.py +0 -36
- torchaudio/prototype/models/_conformer_wav2vec2.py +0 -794
- torchaudio/prototype/models/_emformer_hubert.py +0 -333
- torchaudio/prototype/models/conv_emformer.py +0 -525
- torchaudio/prototype/models/hifi_gan.py +0 -336
- torchaudio/prototype/models/rnnt.py +0 -711
- torchaudio/prototype/models/rnnt_decoder.py +0 -399
- torchaudio/prototype/pipelines/__init__.py +0 -12
- torchaudio/prototype/pipelines/_vggish/__init__.py +0 -3
- torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -233
- torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -82
- torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -228
- torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
- torchaudio/prototype/transforms/__init__.py +0 -9
- torchaudio/prototype/transforms/_transforms.py +0 -456
- torchaudio/sox_effects/__init__.py +0 -10
- torchaudio/sox_effects/sox_effects.py +0 -272
- torchaudio/utils/ffmpeg_utils.py +0 -11
- torchaudio/utils/sox_utils.py +0 -99
- torchaudio-2.7.1.dist-info/RECORD +0 -144
- torio/__init__.py +0 -8
- torio/_extension/__init__.py +0 -13
- torio/_extension/utils.py +0 -147
- torio/io/__init__.py +0 -9
- torio/io/_streaming_media_decoder.py +0 -978
- torio/io/_streaming_media_encoder.py +0 -502
- torio/lib/__init__.py +0 -0
- torio/lib/_torio_ffmpeg4.pyd +0 -0
- torio/lib/_torio_ffmpeg5.pyd +0 -0
- torio/lib/_torio_ffmpeg6.pyd +0 -0
- torio/lib/libtorio_ffmpeg4.pyd +0 -0
- torio/lib/libtorio_ffmpeg5.pyd +0 -0
- torio/lib/libtorio_ffmpeg6.pyd +0 -0
- torio/utils/__init__.py +0 -4
- torio/utils/ffmpeg_utils.py +0 -247
- {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
- {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
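The listing gives the shape of the release: the whole I/O backend stack (torchaudio/_backend/*, torchaudio/backend/*, torchaudio/sox_effects/*, torchaudio/io/*) and the torio package are deleted, while a torchcodec bridge (torchaudio/_torchcodec.py, +340 lines) is added. For downstream code that must run against both wheels, a minimal sketch of a feature probe; the helper name and the fallback are illustrative assumptions, not anything this diff defines:

import importlib.util

def sox_effects_available() -> bool:
    # torchaudio.sox_effects ships in the 2.7.1 wheel but is absent from
    # 2.9.0 (see the file listing above), so probe for the module rather
    # than parsing version strings.
    return importlib.util.find_spec("torchaudio.sox_effects") is not None

if sox_effects_available():
    from torchaudio.sox_effects import apply_effects_tensor  # 2.7.1 path
else:
    # 2.9.0 path: the SoX bindings are gone; equivalent processing has to
    # go through torchaudio.transforms / torchaudio.functional instead.
    apply_effects_tensor = None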
torchaudio/prototype/functional/_rir.py
@@ -1,379 +0,0 @@
-import math
-from typing import Optional, Tuple, Union
-
-import torch
-import torchaudio
-from torch import Tensor
-
-
-def _compute_image_sources(
-    room: torch.Tensor,
-    source: torch.Tensor,
-    max_order: int,
-    absorption: torch.Tensor,
-    scatter: Optional[torch.Tensor] = None,
-) -> Tuple[Tensor, Tensor]:
-    """Compute image sources in a shoebox-like room.
-
-    Args:
-        room (torch.Tensor): The 1D Tensor to determine the room size. The shape is
-            `(D,)`, where ``D`` is 2 if room is a 2D room, or 3 if room is a 3D room.
-        source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
-            `(D)`.
-        max_order (int): The maximum number of reflections of the source.
-        absorption (torch.Tensor): The absorption coefficients of wall materials.
-            ``absorption`` is a Tensor with dimensions `(num_band, num_wall)`.
-            The shape options are ``[(1, 4), (1, 6), (7, 4), (7, 6)]``.
-            ``num_band`` is `1` if the coefficients is the same for all frequencies, or is `7`
-            if the coefficients are different to different frequencies. `7` refers to the default number
-            of octave bands. (See note in `simulate_rir_ism` method).
-            ``num_wall`` is `4` if the room is a 2D room, representing absorption coefficients
-            of ``"west"``, ``"east"``, ``"south"``, and ``"north"`` walls, respectively.
-            Or it is `6` if the room is a 3D room, representing absorption coefficients
-            of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
-        scatter (torch.Tensor): The scattering coefficients of wall materials.
-            The shape of ``scatter`` must match that of ``absorption``. If ``None``, it is not
-            used in image source computation. (Default: ``None``)
-
-    Returns:
-        (torch.Tensor): The coordinates of all image sources within ``max_order`` number of reflections.
-            Tensor with dimensions `(num_image_source, D)`.
-        (torch.Tensor): The attenuation of corresponding image sources. Tensor with dimensions
-            `(num_band, num_image_source)`.
-    """
-    if scatter is None:
-        tr = torch.sqrt(1 - absorption)
-    else:
-        tr = torch.sqrt(1 - absorption) * torch.sqrt(1 - scatter)
-
-    ind = torch.arange(-max_order, max_order + 1, device=source.device)
-    if room.shape[0] == 2:
-        XYZ = torch.meshgrid(ind, ind, indexing="ij")
-    else:
-        XYZ = torch.meshgrid(ind, ind, ind, indexing="ij")
-    XYZ = torch.stack([c.reshape((-1,)) for c in XYZ], dim=-1)
-    XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]
-
-    # compute locations of image sources
-    d = room[None, :]
-    s = source[None, :]
-    img_loc = torch.where(XYZ % 2 == 1, d * (XYZ + 1) - s, d * XYZ + s)
-
-    # attenuation
-    exp_lo = abs(torch.floor((XYZ / 2)))
-    exp_hi = abs(torch.floor((XYZ + 1) / 2))
-    t_lo = tr[:, ::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, left walls)
-    t_hi = tr[:, 1::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, right walls)
-    att = torch.prod((t_lo**exp_lo) * (t_hi**exp_hi), dim=-1)  # (num_band, num_image_source)
-    return img_loc, att
-
-
-def _hann(x: torch.Tensor, T: int):
-    """Compute the Hann window where the values are truncated based on window length.
-    torch.hann_window can only sample window function at integer points, the method is to sample
-    continuous window function at non-integer points.
-
-    Args:
-        x (torch.Tensor): The fractional component of time delay Tensor.
-        T (torch.Tensor): The window length of sinc function.
-
-    Returns:
-        (torch.Tensor): The hann window Tensor where values outside
-            the sinc window (`T`) is set to zero.
-    """
-    y = torch.where(
-        torch.abs(x) <= T / 2,
-        0.5 * (1 + torch.cos(2 * math.pi * x / T)),
-        x.new_zeros(1),
-    )
-    return y
-
-
-def _frac_delay(delay: torch.Tensor, delay_i: torch.Tensor, delay_filter_length: int):
-    """Compute fractional delay of impulse response signal.
-
-    Args:
-        delay (torch.Tensor): The time delay Tensor in samples.
-        delay_i (torch.Tensor): The integer part of delay.
-        delay_filter_length (int): The window length for sinc function.
-
-    Returns:
-        (torch.Tensor): The impulse response Tensor for all image sources.
-    """
-    if delay_filter_length % 2 != 1:
-        raise ValueError("The filter length must be odd")
-
-    pad = delay_filter_length // 2
-    n = torch.arange(-pad, pad + 1, device=delay.device) + delay_i[..., None]
-    delay = delay[..., None]
-
-    return torch.special.sinc(n - delay) * _hann(n - delay, 2 * pad)
-
-
-def _adjust_coeff(coeffs: Union[float, torch.Tensor], name: str) -> torch.Tensor:
-    """Validates and converts absorption or scattering parameters to a tensor with appropriate shape
-
-    Args:
-        coeff (float or torch.Tensor): The absorption coefficients of wall materials.
-
-            If the dtype is ``float``, the absorption coefficient is identical for all walls and
-            all frequencies.
-
-            If ``absorption`` is a 1D Tensor, the shape must be `(2*dim,)`,
-            where the values represent absorption coefficients of ``"west"``, ``"east"``,
-            ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
-
-            If ``absorption`` is a 2D Tensor, the shape must be `(7, 2*dim)`,
-            where 7 represents the number of octave bands.
-
-    Returns:
-        (torch.Tensor): The expanded coefficient.
-            The shape is `(1, 6)` for single octave band case, and
-            `(7, 6)` for multi octave band case.
-    """
-    num_walls = 6
-    if isinstance(coeffs, float):
-        if coeffs < 0:
-            raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
-        return torch.full((1, num_walls), coeffs)
-    if isinstance(coeffs, Tensor):
-        if torch.any(coeffs < 0):
-            raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
-        if coeffs.ndim == 1:
-            if coeffs.numel() != num_walls:
-                raise ValueError(
-                    f"The shape of `{name}` must be ({num_walls},) when it is a 1D Tensor. "
-                    f"Found the shape {coeffs.shape}."
-                )
-            return coeffs.unsqueeze(0)
-        if coeffs.ndim == 2:
-            if coeffs.shape[1] != num_walls:
-                raise ValueError(
-                    f"The shape of `{name}` must be (NUM_BANDS, {num_walls}) when it "
-                    f"is a 2D Tensor. Found: {coeffs.shape}."
-                )
-            return coeffs
-    raise TypeError(f"`{name}` must be float or Tensor.")
-
-
-def _validate_inputs(
-    room: torch.Tensor,
-    source: torch.Tensor,
-    mic_array: torch.Tensor,
-):
-    """Validate dimensions of input arguments, and normalize different kinds of absorption into the same dimension.
-
-    Args:
-        room (torch.Tensor): The size of the room. width, length (and height)
-        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(dim,)`.
-        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, dim)`.
-    """
-    if not (room.ndim == 1 and room.numel() == 3):
-        raise ValueError(f"`room` must be a 1D Tensor with 3 elements. Found {room.shape}.")
-    if not (source.ndim == 1 and source.numel() == 3):
-        raise ValueError(f"`source` must be 1D Tensor with 3 elements. Found {source.shape}.")
-    if not (mic_array.ndim == 2 and mic_array.shape[1] == 3):
-        raise ValueError(f"`mic_array` must be a 2D Tensor with shape (num_channels, 3). Found {mic_array.shape}.")
-
-
-def simulate_rir_ism(
-    room: torch.Tensor,
-    source: torch.Tensor,
-    mic_array: torch.Tensor,
-    max_order: int,
-    absorption: Union[float, torch.Tensor],
-    output_length: Optional[int] = None,
-    delay_filter_length: int = 81,
-    center_frequency: Optional[torch.Tensor] = None,
-    sound_speed: float = 343.0,
-    sample_rate: float = 16000.0,
-) -> Tensor:
-    r"""Compute Room Impulse Response (RIR) based on the *image source method* :cite:`allen1979image`.
-    The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
-
-    .. devices:: CPU
-
-    .. properties:: TorchScript
-
-    Args:
-        room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
-            three dimensions of the room.
-        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
-        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
-        max_order (int): The maximum number of reflections of the source.
-        absorption (float or torch.Tensor): The *absorption* :cite:`wiki:Absorption_(acoustics)`
-            coefficients of wall materials for sound energy.
-            If the dtype is ``float``, the absorption coefficient is identical for all walls and
-            all frequencies.
-            If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
-            absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
-            and ``"ceiling"``, respectively.
-            If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
-        output_length (int or None, optional): The output length of simulated RIR signal. If ``None``,
-            the length is defined as
-
-            .. math::
-                \frac{\text{max\_d} \cdot \text{sample\_rate}}{\text{sound\_speed}} + \text{delay\_filter\_length}
-
-            where ``max_d`` is the maximum distance between image sources and microphones.
-        delay_filter_length (int, optional): The filter length for computing sinc function. (Default: ``81``)
-        center_frequency (torch.Tensor, optional): The center frequencies of octave bands for multi-band walls.
-            Only used when ``absorption`` is a 2D Tensor.
-        sound_speed (float, optional): The speed of sound. (Default: ``343.0``)
-        sample_rate (float, optional): The sample rate of the generated room impulse response signal.
-            (Default: ``16000.0``)
-
-    Returns:
-        (torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
-            `(channel, rir_length)`.
-
-    Note:
-        If ``absorption`` is a 2D Tensor and ``center_frequency`` is set to ``None``, the center frequencies
-        of octave bands are fixed to ``[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]``.
-        Users need to tune the values of ``absorption`` to the corresponding frequencies.
-    """
-    _validate_inputs(room, source, mic_array)
-    absorption = _adjust_coeff(absorption, "absorption")
-    img_location, att = _compute_image_sources(room, source, max_order, absorption)
-
-    # compute distances between image sources and microphones
-    vec = img_location[:, None, :] - mic_array[None, :, :]
-    dist = torch.linalg.norm(vec, dim=-1)  # (image_source, channel)
-
-    img_src_att = att[..., None] / dist[None, ...]  # (band, image_source, channel)
-
-    # separate delays in integer / frac part
-    delay = dist * sample_rate / sound_speed  # distance to delay in samples
-    delay_i = torch.ceil(delay)  # integer part
-
-    # compute the shorts IRs corresponding to each image source
-    irs = img_src_att[..., None] * _frac_delay(delay, delay_i, delay_filter_length)[None, ...]
-
-    rir_length = int(delay_i.max() + irs.shape[-1])
-    rir = torch.ops.torchaudio._simulate_rir(irs, delay_i.type(torch.int32), rir_length)
-
-    # multi-band processing
-    if absorption.shape[0] > 1:
-        if center_frequency is None:
-            center = torch.tensor(
-                [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0], dtype=room.dtype, device=room.device
-            )
-        else:
-            center = center_frequency
-        # n_fft is set to 512 by default.
-        filters = torch.ops.torchaudio._make_rir_filter(center, sample_rate, n_fft=512)
-        rir = torchaudio.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1), mode="same")
-
-    # sum up rir signals of all image sources into one waveform.
-    rir = rir.sum(0)
-
-    if output_length is not None:
-        if output_length > rir.shape[-1]:
-            rir = torch.nn.functional.pad(rir, (0, output_length - rir.shape[-1]), "constant", 0.0)
-        else:
-            rir = rir[..., :output_length]
-
-    return rir
-
-
-def ray_tracing(
-    room: torch.Tensor,
-    source: torch.Tensor,
-    mic_array: torch.Tensor,
-    num_rays: int,
-    absorption: Union[float, torch.Tensor] = 0.0,
-    scattering: Union[float, torch.Tensor] = 0.0,
-    mic_radius: float = 0.5,
-    sound_speed: float = 343.0,
-    energy_thres: float = 1e-7,
-    time_thres: float = 10.0,
-    hist_bin_size: float = 0.004,
-) -> torch.Tensor:
-    r"""Compute energy histogram via ray tracing.
-
-    The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
-
-    ``num_rays`` rays are casted uniformly in all directions from the source;
-    when a ray intersects a wall, it is reflected and part of its energy is absorbed.
-    It is also scattered (sent directly to the microphone(s)) according to the ``scattering``
-    coefficient.
-    When a ray is close to the microphone, its current energy is recorded in the output
-    histogram for that given time slot.
-
-    .. devices:: CPU
-
-    .. properties:: TorchScript
-
-    Args:
-        room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
-            three dimensions of the room.
-        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
-        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
-        absorption (float or torch.Tensor, optional): The absorption coefficients of wall materials.
-            (Default: ``0.0``).
-            If the type is ``float``, the absorption coefficient is identical to all walls and
-            all frequencies.
-            If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, representing absorption
-            coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and
-            ``"ceiling"``, respectively.
-            If ``absorption`` is a 2D Tensor, the shape must be `(num_bands, 6)`.
-            ``num_bands`` is the number of frequency bands (usually 7).
-        scattering(float or torch.Tensor, optional): The scattering coefficients of wall materials. (Default: ``0.0``)
-            The shape and type of this parameter is the same as for ``absorption``.
-        mic_radius(float, optional): The radius of the microphone in meters. (Default: 0.5)
-        sound_speed (float, optional): The speed of sound in meters per second. (Default: ``343.0``)
-        energy_thres (float, optional): The energy level below which we stop tracing a ray. (Default: ``1e-7``)
-            The initial energy of each ray is ``2 / num_rays``.
-        time_thres (float, optional): The maximal duration for which rays are traced. (Unit: seconds) (Default: 10.0)
-        hist_bin_size (float, optional): The size of each bin in the output histogram. (Unit: seconds) (Default: 0.004)
-
-    Returns:
-        (torch.Tensor): The 3D histogram(s) where the energy of the traced ray is recorded.
-            Each bin corresponds to a given time slot.
-            The shape is `(channel, num_bands, num_bins)`, where
-            ``num_bins = ceil(time_thres / hist_bin_size)``.
-            If both ``absorption`` and ``scattering`` are floats, then ``num_bands == 1``.
-    """
-    if time_thres < hist_bin_size:
-        raise ValueError(
-            "`time_thres` must be greater than `hist_bin_size`. "
-            f"Found: hist_bin_size={hist_bin_size}, time_thres={time_thres}."
-        )
-
-    if room.dtype != source.dtype or source.dtype != mic_array.dtype:
-        raise ValueError(
-            "dtype of `room`, `source` and `mic_array` must match. "
-            f"Found: `room` ({room.dtype}), `source` ({source.dtype}) and "
-            f"`mic_array` ({mic_array.dtype})"
-        )
-
-    _validate_inputs(room, source, mic_array)
-    absorption = _adjust_coeff(absorption, "absorption").to(room.dtype)
-    scattering = _adjust_coeff(scattering, "scattering").to(room.dtype)
-
-    # Bring absorption and scattering to the same shape
-    if absorption.shape[0] == 1 and scattering.shape[0] > 1:
-        absorption = absorption.expand(scattering.shape)
-    if scattering.shape[0] == 1 and absorption.shape[0] > 1:
-        scattering = scattering.expand(absorption.shape)
-    if absorption.shape != scattering.shape:
-        raise ValueError(
-            "`absorption` and `scattering` must be broadcastable to the same number of bands and walls. "
-            f"Inferred shapes absorption={absorption.shape} and scattering={scattering.shape}"
-        )
-
-    histograms = torch.ops.torchaudio.ray_tracing(
-        room,
-        source,
-        mic_array,
-        num_rays,
-        absorption,
-        scattering,
-        mic_radius,
-        sound_speed,
-        energy_thres,
-        time_thres,
-        hist_bin_size,
-    )
-
-    return histograms
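For reference, a sketch of how the two removed entry points were called on a 2.7.1 install, with tensor shapes taken from the docstrings above; the geometry values are arbitrary examples:

import torch
import torchaudio.prototype.functional as F  # present in 2.7.1, removed by this release

room = torch.tensor([8.0, 6.0, 3.0])         # (3,): room size in meters
source = torch.tensor([2.0, 1.5, 1.2])       # (3,): sound source position
mic_array = torch.tensor([[5.0, 3.0, 1.2]])  # (channel, 3): a single microphone

# Image source method: a scalar absorption applies to all six walls and all bands.
rir = F.simulate_rir_ism(room, source, mic_array, max_order=3, absorption=0.2)
print(rir.shape)  # (channel, rir_length)

# Ray tracing returns an energy histogram rather than a waveform.
hist = F.ray_tracing(room, source, mic_array, num_rays=10_000)
print(hist.shape)  # (channel, num_bands, num_bins), num_bins = ceil(10.0 / 0.004) = 2500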
torchaudio/prototype/functional/functional.py
@@ -1,190 +0,0 @@
-import math
-import warnings
-from typing import Optional
-
-import torch
-from torchaudio.functional.functional import _create_triangular_filterbank
-
-
-def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
-    r"""Convert Hz to Barks.
-
-    Args:
-        freqs (float): Frequencies in Hz
-        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Returns:
-        barks (float): Frequency in Barks
-    """
-
-    if bark_scale not in ["schroeder", "traunmuller", "wang"]:
-        raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
-
-    if bark_scale == "wang":
-        return 6.0 * math.asinh(freqs / 600.0)
-    elif bark_scale == "schroeder":
-        return 7.0 * math.asinh(freqs / 650.0)
-    # Traunmuller Bark scale
-    barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
-    # Bark value correction
-    if barks < 2:
-        barks += 0.15 * (2 - barks)
-    elif barks > 20.1:
-        barks += 0.22 * (barks - 20.1)
-
-    return barks
-
-
-def _bark_to_hz(barks: torch.Tensor, bark_scale: str = "traunmuller") -> torch.Tensor:
-    """Convert bark bin numbers to frequencies.
-
-    Args:
-        barks (torch.Tensor): Bark frequencies
-        bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Returns:
-        freqs (torch.Tensor): Barks converted in Hz
-    """
-
-    if bark_scale not in ["schroeder", "traunmuller", "wang"]:
-        raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
-
-    if bark_scale == "wang":
-        return 600.0 * torch.sinh(barks / 6.0)
-    elif bark_scale == "schroeder":
-        return 650.0 * torch.sinh(barks / 7.0)
-    # Bark value correction
-    if any(barks < 2):
-        idx = barks < 2
-        barks[idx] = (barks[idx] - 0.3) / 0.85
-    elif any(barks > 20.1):
-        idx = barks > 20.1
-        barks[idx] = (barks[idx] + 4.422) / 1.22
-
-    # Traunmuller Bark scale
-    freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
-
-    return freqs
-
-
-def _hz_to_octs(freqs, tuning=0.0, bins_per_octave=12):
-    a440 = 440.0 * 2.0 ** (tuning / bins_per_octave)
-    return torch.log2(freqs / (a440 / 16))
-
-
-def barkscale_fbanks(
-    n_freqs: int,
-    f_min: float,
-    f_max: float,
-    n_barks: int,
-    sample_rate: int,
-    bark_scale: str = "traunmuller",
-) -> torch.Tensor:
-    r"""Create a frequency bin conversion matrix.
-
-    .. devices:: CPU
-
-    .. properties:: TorchScript
-
-    .. image:: https://download.pytorch.org/torchaudio/doc-assets/bark_fbanks.png
-        :alt: Visualization of generated filter bank
-
-    Args:
-        n_freqs (int): Number of frequencies to highlight/apply
-        f_min (float): Minimum frequency (Hz)
-        f_max (float): Maximum frequency (Hz)
-        n_barks (int): Number of mel filterbanks
-        sample_rate (int): Sample rate of the audio waveform
-        bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Returns:
-        torch.Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``)
-        meaning number of frequencies to highlight/apply to x the number of filterbanks.
-        Each column is a filterbank so that assuming there is a matrix A of
-        size (..., ``n_freqs``), the applied result would be
-        ``A * barkscale_fbanks(A.size(-1), ...)``.
-
-    """
-
-    # freq bins
-    all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
-
-    # calculate bark freq bins
-    m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
-    m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
-
-    m_pts = torch.linspace(m_min, m_max, n_barks + 2)
-    f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
-
-    # create filterbank
-    fb = _create_triangular_filterbank(all_freqs, f_pts)
-
-    if (fb.max(dim=0).values == 0.0).any():
-        warnings.warn(
-            "At least one bark filterbank has all zero values. "
-            f"The value for `n_barks` ({n_barks}) may be set too high. "
-            f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
-        )
-
-    return fb
-
-
-def chroma_filterbank(
-    sample_rate: int,
-    n_freqs: int,
-    n_chroma: int,
-    *,
-    tuning: float = 0.0,
-    ctroct: float = 5.0,
-    octwidth: Optional[float] = 2.0,
-    norm: int = 2,
-    base_c: bool = True,
-):
-    """Create a frequency-to-chroma conversion matrix. Implementation adapted from librosa.
-
-    Args:
-        sample_rate (int): Sample rate.
-        n_freqs (int): Number of input frequencies.
-        n_chroma (int): Number of output chroma.
-        tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
-        ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
-        octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
-            If ``None``, then disable weighting altogether. (Default: 2.0)
-        norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
-        base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
-
-    Returns:
-        torch.Tensor: Chroma filter bank, with shape `(n_freqs, n_chroma)`.
-    """
-    # Skip redundant upper half of frequency range.
-    freqs = torch.linspace(0, sample_rate // 2, n_freqs)[1:]
-    freq_bins = n_chroma * _hz_to_octs(freqs, bins_per_octave=n_chroma, tuning=tuning)
-    freq_bins = torch.cat((torch.tensor([freq_bins[0] - 1.5 * n_chroma]), freq_bins))
-    freq_bin_widths = torch.cat(
-        (
-            torch.maximum(freq_bins[1:] - freq_bins[:-1], torch.tensor(1.0)),
-            torch.tensor([1]),
-        )
-    )
-
-    # (n_freqs, n_chroma)
-    D = freq_bins.unsqueeze(1) - torch.arange(0, n_chroma)
-
-    n_chroma2 = round(n_chroma / 2)
-
-    # Project to range [-n_chroma/2, n_chroma/2 - 1]
-    D = torch.remainder(D + n_chroma2, n_chroma) - n_chroma2
-
-    fb = torch.exp(-0.5 * (2 * D / torch.tile(freq_bin_widths.unsqueeze(1), (1, n_chroma))) ** 2)
-    fb = torch.nn.functional.normalize(fb, p=norm, dim=1)
-
-    if octwidth is not None:
-        fb *= torch.tile(
-            torch.exp(-0.5 * (((freq_bins.unsqueeze(1) / n_chroma - ctroct) / octwidth) ** 2)),
-            (1, n_chroma),
-        )
-
-    if base_c:
-        fb = torch.roll(fb, -3 * (n_chroma // 12), dims=1)
-
    return fb
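barkscale_fbanks follows the same convention as melscale_fbanks in stable torchaudio: it returns an (n_freqs, n_barks) matrix that is applied to a spectrogram by matrix multiplication from the right. A usage sketch against the 2.7.1 export; the random spectrogram is placeholder data:

import torch
from torchaudio.prototype.functional import barkscale_fbanks  # removed by this release

n_fft = 1024
fb = barkscale_fbanks(
    n_freqs=n_fft // 2 + 1,
    f_min=0.0,
    f_max=8000.0,
    n_barks=32,
    sample_rate=16000,
)
print(fb.shape)  # (513, 32) == (n_freqs, n_barks)

# Per the docstring above, a matrix A of shape (..., n_freqs) is projected
# onto the Bark bands as A @ fb.
spec = torch.rand(1, 400, n_fft // 2 + 1)  # (batch, time, n_freqs)
bark_spec = spec @ fb                      # (batch, time, n_barks)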
torchaudio/prototype/models/__init__.py
@@ -1,36 +0,0 @@
-from ._conformer_wav2vec2 import (
-    conformer_wav2vec2_base,
-    conformer_wav2vec2_model,
-    conformer_wav2vec2_pretrain_base,
-    conformer_wav2vec2_pretrain_large,
-    conformer_wav2vec2_pretrain_model,
-    ConformerWav2Vec2PretrainModel,
-)
-from ._emformer_hubert import emformer_hubert_base, emformer_hubert_model
-from .conv_emformer import ConvEmformer
-from .hifi_gan import hifigan_vocoder, hifigan_vocoder_v1, hifigan_vocoder_v2, hifigan_vocoder_v3, HiFiGANVocoder
-from .rnnt import conformer_rnnt_base, conformer_rnnt_biasing, conformer_rnnt_biasing_base, conformer_rnnt_model
-from .rnnt_decoder import Hypothesis, RNNTBeamSearchBiasing
-
-__all__ = [
-    "conformer_rnnt_base",
-    "conformer_rnnt_model",
-    "conformer_rnnt_biasing",
-    "conformer_rnnt_biasing_base",
-    "ConvEmformer",
-    "conformer_wav2vec2_model",
-    "conformer_wav2vec2_base",
-    "conformer_wav2vec2_pretrain_model",
-    "conformer_wav2vec2_pretrain_base",
-    "conformer_wav2vec2_pretrain_large",
-    "ConformerWav2Vec2PretrainModel",
-    "emformer_hubert_base",
-    "emformer_hubert_model",
-    "Hypothesis",
-    "RNNTBeamSearchBiasing",
-    "HiFiGANVocoder",
-    "hifigan_vocoder_v1",
-    "hifigan_vocoder_v2",
-    "hifigan_vocoder_v3",
-    "hifigan_vocoder",
-]
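Since the entire torchaudio.prototype package disappears with this release, the imports above are themselves the breaking change. A guard sketch for downstream code; the fallback behavior is an assumption, as the diff defines no replacement:

try:
    # Works on torchaudio 2.7.1; raises ModuleNotFoundError on 2.9.0,
    # where the prototype package is absent from the wheel.
    from torchaudio.prototype.models import ConvEmformer
except ModuleNotFoundError:
    ConvEmformer = None

if ConvEmformer is None:
    raise RuntimeError(
        "torchaudio.prototype.models was removed; pin torchaudio<=2.7.1 "
        "or migrate off the prototype APIs."
    )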