torchaudio-2.7.1-cp311-cp311-win_amd64.whl → torchaudio-2.9.0-cp311-cp311-win_amd64.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Note: this release has been flagged as potentially problematic.

Files changed (92)
  1. torchaudio/__init__.py +184 -33
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +68 -10
  5. torchaudio/_torchcodec.py +340 -0
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +6 -3
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +31 -61
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +19 -1
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +3 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +4 -1
  23. torchaudio/transforms/_transforms.py +4 -3
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -1
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/METADATA +15 -7
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -317
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -13
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -144
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -67
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -433
  54. torchaudio/prototype/functional/_rir.py +0 -379
  55. torchaudio/prototype/functional/functional.py +0 -190
  56. torchaudio/prototype/models/__init__.py +0 -36
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -794
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -333
  59. torchaudio/prototype/models/conv_emformer.py +0 -525
  60. torchaudio/prototype/models/hifi_gan.py +0 -336
  61. torchaudio/prototype/models/rnnt.py +0 -711
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -399
  63. torchaudio/prototype/pipelines/__init__.py +0 -12
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -3
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -233
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -82
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -228
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -456
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -272
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -99
  75. torchaudio-2.7.1.dist-info/RECORD +0 -144
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -978
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -247
  91. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.7.1.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
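
The headline change in this file list is the removal of the entire I/O backend layer (torchaudio/_backend, torchaudio/backend, torchaudio/sox_effects, torchaudio/io, and the torio package) in favor of a new torchaudio/_torchcodec.py shim. The sketch below shows what a consumer-side check for this upgrade might look like; it assumes (not confirmed by this diff) that 2.9.0 routes torchaudio.load through torchcodec's AudioDecoder, so torchcodec and FFmpeg must be installed at runtime.

# Hypothetical upgrade check for 2.7.1 -> 2.9.0; `load_audio` is our own
# helper, not a torchaudio API. Assumes torchaudio.load survives the
# backend removal (the new _torchcodec.py suggests it now wraps torchcodec)
# and that torchcodec is installed alongside torchaudio.
import torch
import torchaudio

def load_audio(path: str) -> tuple[torch.Tensor, int]:
    try:
        # 2.9.0: delegates to torchcodec instead of the removed
        # sox/soundfile/ffmpeg backends (assumption).
        return torchaudio.load(path)
    except (RuntimeError, ImportError):
        # Fall back to torchcodec directly if the shim refuses the file.
        from torchcodec.decoders import AudioDecoder  # assumption: torchcodec is available
        samples = AudioDecoder(path).get_all_samples()
        return samples.data, int(samples.sample_rate)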
torchaudio/prototype/functional/_rir.py
@@ -1,379 +0,0 @@
-import math
-from typing import Optional, Tuple, Union
-
-import torch
-import torchaudio
-from torch import Tensor
-
-
-def _compute_image_sources(
-    room: torch.Tensor,
-    source: torch.Tensor,
-    max_order: int,
-    absorption: torch.Tensor,
-    scatter: Optional[torch.Tensor] = None,
-) -> Tuple[Tensor, Tensor]:
-    """Compute image sources in a shoebox-like room.
-
-    Args:
-        room (torch.Tensor): The 1D Tensor to determine the room size. The shape is
-            `(D,)`, where ``D`` is 2 if room is a 2D room, or 3 if room is a 3D room.
-        source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
-            `(D)`.
-        max_order (int): The maximum number of reflections of the source.
-        absorption (torch.Tensor): The absorption coefficients of wall materials.
-            ``absorption`` is a Tensor with dimensions `(num_band, num_wall)`.
-            The shape options are ``[(1, 4), (1, 6), (7, 4), (7, 6)]``.
-            ``num_band`` is `1` if the coefficients is the same for all frequencies, or is `7`
-            if the coefficients are different to different frequencies. `7` refers to the default number
-            of octave bands. (See note in `simulate_rir_ism` method).
-            ``num_wall`` is `4` if the room is a 2D room, representing absorption coefficients
-            of ``"west"``, ``"east"``, ``"south"``, and ``"north"`` walls, respectively.
-            Or it is `6` if the room is a 3D room, representing absorption coefficients
-            of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
-        scatter (torch.Tensor): The scattering coefficients of wall materials.
-            The shape of ``scatter`` must match that of ``absorption``. If ``None``, it is not
-            used in image source computation. (Default: ``None``)
-
-    Returns:
-        (torch.Tensor): The coordinates of all image sources within ``max_order`` number of reflections.
-            Tensor with dimensions `(num_image_source, D)`.
-        (torch.Tensor): The attenuation of corresponding image sources. Tensor with dimensions
-            `(num_band, num_image_source)`.
-    """
-    if scatter is None:
-        tr = torch.sqrt(1 - absorption)
-    else:
-        tr = torch.sqrt(1 - absorption) * torch.sqrt(1 - scatter)
-
-    ind = torch.arange(-max_order, max_order + 1, device=source.device)
-    if room.shape[0] == 2:
-        XYZ = torch.meshgrid(ind, ind, indexing="ij")
-    else:
-        XYZ = torch.meshgrid(ind, ind, ind, indexing="ij")
-    XYZ = torch.stack([c.reshape((-1,)) for c in XYZ], dim=-1)
-    XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]
-
-    # compute locations of image sources
-    d = room[None, :]
-    s = source[None, :]
-    img_loc = torch.where(XYZ % 2 == 1, d * (XYZ + 1) - s, d * XYZ + s)
-
-    # attenuation
-    exp_lo = abs(torch.floor((XYZ / 2)))
-    exp_hi = abs(torch.floor((XYZ + 1) / 2))
-    t_lo = tr[:, ::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, left walls)
-    t_hi = tr[:, 1::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1)  # (num_band, right walls)
-    att = torch.prod((t_lo**exp_lo) * (t_hi**exp_hi), dim=-1)  # (num_band, num_image_source)
-    return img_loc, att
-
-
-def _hann(x: torch.Tensor, T: int):
-    """Compute the Hann window where the values are truncated based on window length.
-    torch.hann_window can only sample window function at integer points, the method is to sample
-    continuous window function at non-integer points.
-
-    Args:
-        x (torch.Tensor): The fractional component of time delay Tensor.
-        T (torch.Tensor): The window length of sinc function.
-
-    Returns:
-        (torch.Tensor): The hann window Tensor where values outside
-            the sinc window (`T`) is set to zero.
-    """
-    y = torch.where(
-        torch.abs(x) <= T / 2,
-        0.5 * (1 + torch.cos(2 * math.pi * x / T)),
-        x.new_zeros(1),
-    )
-    return y
-
-
-def _frac_delay(delay: torch.Tensor, delay_i: torch.Tensor, delay_filter_length: int):
-    """Compute fractional delay of impulse response signal.
-
-    Args:
-        delay (torch.Tensor): The time delay Tensor in samples.
-        delay_i (torch.Tensor): The integer part of delay.
-        delay_filter_length (int): The window length for sinc function.
-
-    Returns:
-        (torch.Tensor): The impulse response Tensor for all image sources.
-    """
-    if delay_filter_length % 2 != 1:
-        raise ValueError("The filter length must be odd")
-
-    pad = delay_filter_length // 2
-    n = torch.arange(-pad, pad + 1, device=delay.device) + delay_i[..., None]
-    delay = delay[..., None]
-
-    return torch.special.sinc(n - delay) * _hann(n - delay, 2 * pad)
-
-
-def _adjust_coeff(coeffs: Union[float, torch.Tensor], name: str) -> torch.Tensor:
-    """Validates and converts absorption or scattering parameters to a tensor with appropriate shape
-
-    Args:
-        coeff (float or torch.Tensor): The absorption coefficients of wall materials.
-
-            If the dtype is ``float``, the absorption coefficient is identical for all walls and
-            all frequencies.
-
-            If ``absorption`` is a 1D Tensor, the shape must be `(2*dim,)`,
-            where the values represent absorption coefficients of ``"west"``, ``"east"``,
-            ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
-
-            If ``absorption`` is a 2D Tensor, the shape must be `(7, 2*dim)`,
-            where 7 represents the number of octave bands.
-
-    Returns:
-        (torch.Tensor): The expanded coefficient.
-            The shape is `(1, 6)` for single octave band case, and
-            `(7, 6)` for multi octave band case.
-    """
-    num_walls = 6
-    if isinstance(coeffs, float):
-        if coeffs < 0:
-            raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
-        return torch.full((1, num_walls), coeffs)
-    if isinstance(coeffs, Tensor):
-        if torch.any(coeffs < 0):
-            raise ValueError(f"`{name}` must be non-negative. Found: {coeffs}")
-        if coeffs.ndim == 1:
-            if coeffs.numel() != num_walls:
-                raise ValueError(
-                    f"The shape of `{name}` must be ({num_walls},) when it is a 1D Tensor. "
-                    f"Found the shape {coeffs.shape}."
-                )
-            return coeffs.unsqueeze(0)
-        if coeffs.ndim == 2:
-            if coeffs.shape[1] != num_walls:
-                raise ValueError(
-                    f"The shape of `{name}` must be (NUM_BANDS, {num_walls}) when it "
-                    f"is a 2D Tensor. Found: {coeffs.shape}."
-                )
-            return coeffs
-    raise TypeError(f"`{name}` must be float or Tensor.")
-
-
-def _validate_inputs(
-    room: torch.Tensor,
-    source: torch.Tensor,
-    mic_array: torch.Tensor,
-):
-    """Validate dimensions of input arguments, and normalize different kinds of absorption into the same dimension.
-
-    Args:
-        room (torch.Tensor): The size of the room. width, length (and height)
-        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(dim,)`.
-        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, dim)`.
-    """
-    if not (room.ndim == 1 and room.numel() == 3):
-        raise ValueError(f"`room` must be a 1D Tensor with 3 elements. Found {room.shape}.")
-    if not (source.ndim == 1 and source.numel() == 3):
-        raise ValueError(f"`source` must be 1D Tensor with 3 elements. Found {source.shape}.")
-    if not (mic_array.ndim == 2 and mic_array.shape[1] == 3):
-        raise ValueError(f"`mic_array` must be a 2D Tensor with shape (num_channels, 3). Found {mic_array.shape}.")
-
-
-def simulate_rir_ism(
-    room: torch.Tensor,
-    source: torch.Tensor,
-    mic_array: torch.Tensor,
-    max_order: int,
-    absorption: Union[float, torch.Tensor],
-    output_length: Optional[int] = None,
-    delay_filter_length: int = 81,
-    center_frequency: Optional[torch.Tensor] = None,
-    sound_speed: float = 343.0,
-    sample_rate: float = 16000.0,
-) -> Tensor:
-    r"""Compute Room Impulse Response (RIR) based on the *image source method* :cite:`allen1979image`.
-    The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
-
-    .. devices:: CPU
-
-    .. properties:: TorchScript
-
-    Args:
-        room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
-            three dimensions of the room.
-        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
-        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
-        max_order (int): The maximum number of reflections of the source.
-        absorption (float or torch.Tensor): The *absorption* :cite:`wiki:Absorption_(acoustics)`
-            coefficients of wall materials for sound energy.
-            If the dtype is ``float``, the absorption coefficient is identical for all walls and
-            all frequencies.
-            If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
-            absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
-            and ``"ceiling"``, respectively.
-            If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
-        output_length (int or None, optional): The output length of simulated RIR signal. If ``None``,
-            the length is defined as
-
-            .. math::
-                \frac{\text{max\_d} \cdot \text{sample\_rate}}{\text{sound\_speed}} + \text{delay\_filter\_length}
-
-            where ``max_d`` is the maximum distance between image sources and microphones.
-        delay_filter_length (int, optional): The filter length for computing sinc function. (Default: ``81``)
-        center_frequency (torch.Tensor, optional): The center frequencies of octave bands for multi-band walls.
-            Only used when ``absorption`` is a 2D Tensor.
-        sound_speed (float, optional): The speed of sound. (Default: ``343.0``)
-        sample_rate (float, optional): The sample rate of the generated room impulse response signal.
-            (Default: ``16000.0``)
-
-    Returns:
-        (torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
-            `(channel, rir_length)`.
-
-    Note:
-        If ``absorption`` is a 2D Tensor and ``center_frequency`` is set to ``None``, the center frequencies
-        of octave bands are fixed to ``[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]``.
-        Users need to tune the values of ``absorption`` to the corresponding frequencies.
-    """
-    _validate_inputs(room, source, mic_array)
-    absorption = _adjust_coeff(absorption, "absorption")
-    img_location, att = _compute_image_sources(room, source, max_order, absorption)
-
-    # compute distances between image sources and microphones
-    vec = img_location[:, None, :] - mic_array[None, :, :]
-    dist = torch.linalg.norm(vec, dim=-1)  # (image_source, channel)
-
-    img_src_att = att[..., None] / dist[None, ...]  # (band, image_source, channel)
-
-    # separate delays in integer / frac part
-    delay = dist * sample_rate / sound_speed  # distance to delay in samples
-    delay_i = torch.ceil(delay)  # integer part
-
-    # compute the shorts IRs corresponding to each image source
-    irs = img_src_att[..., None] * _frac_delay(delay, delay_i, delay_filter_length)[None, ...]
-
-    rir_length = int(delay_i.max() + irs.shape[-1])
-    rir = torch.ops.torchaudio._simulate_rir(irs, delay_i.type(torch.int32), rir_length)
-
-    # multi-band processing
-    if absorption.shape[0] > 1:
-        if center_frequency is None:
-            center = torch.tensor(
-                [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0], dtype=room.dtype, device=room.device
-            )
-        else:
-            center = center_frequency
-        # n_fft is set to 512 by default.
-        filters = torch.ops.torchaudio._make_rir_filter(center, sample_rate, n_fft=512)
-        rir = torchaudio.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1), mode="same")
-
-    # sum up rir signals of all image sources into one waveform.
-    rir = rir.sum(0)
-
-    if output_length is not None:
-        if output_length > rir.shape[-1]:
-            rir = torch.nn.functional.pad(rir, (0, output_length - rir.shape[-1]), "constant", 0.0)
-        else:
-            rir = rir[..., :output_length]
-
-    return rir
-
-
-def ray_tracing(
-    room: torch.Tensor,
-    source: torch.Tensor,
-    mic_array: torch.Tensor,
-    num_rays: int,
-    absorption: Union[float, torch.Tensor] = 0.0,
-    scattering: Union[float, torch.Tensor] = 0.0,
-    mic_radius: float = 0.5,
-    sound_speed: float = 343.0,
-    energy_thres: float = 1e-7,
-    time_thres: float = 10.0,
-    hist_bin_size: float = 0.004,
-) -> torch.Tensor:
-    r"""Compute energy histogram via ray tracing.
-
-    The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.
-
-    ``num_rays`` rays are casted uniformly in all directions from the source;
-    when a ray intersects a wall, it is reflected and part of its energy is absorbed.
-    It is also scattered (sent directly to the microphone(s)) according to the ``scattering``
-    coefficient.
-    When a ray is close to the microphone, its current energy is recorded in the output
-    histogram for that given time slot.
-
-    .. devices:: CPU
-
-    .. properties:: TorchScript
-
-    Args:
-        room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
-            three dimensions of the room.
-        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
-        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
-        absorption (float or torch.Tensor, optional): The absorption coefficients of wall materials.
-            (Default: ``0.0``).
-            If the type is ``float``, the absorption coefficient is identical to all walls and
-            all frequencies.
-            If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, representing absorption
-            coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and
-            ``"ceiling"``, respectively.
-            If ``absorption`` is a 2D Tensor, the shape must be `(num_bands, 6)`.
-            ``num_bands`` is the number of frequency bands (usually 7).
-        scattering(float or torch.Tensor, optional): The scattering coefficients of wall materials. (Default: ``0.0``)
-            The shape and type of this parameter is the same as for ``absorption``.
-        mic_radius(float, optional): The radius of the microphone in meters. (Default: 0.5)
-        sound_speed (float, optional): The speed of sound in meters per second. (Default: ``343.0``)
-        energy_thres (float, optional): The energy level below which we stop tracing a ray. (Default: ``1e-7``)
-            The initial energy of each ray is ``2 / num_rays``.
-        time_thres (float, optional): The maximal duration for which rays are traced. (Unit: seconds) (Default: 10.0)
-        hist_bin_size (float, optional): The size of each bin in the output histogram. (Unit: seconds) (Default: 0.004)
-
-    Returns:
-        (torch.Tensor): The 3D histogram(s) where the energy of the traced ray is recorded.
-            Each bin corresponds to a given time slot.
-            The shape is `(channel, num_bands, num_bins)`, where
-            ``num_bins = ceil(time_thres / hist_bin_size)``.
-            If both ``absorption`` and ``scattering`` are floats, then ``num_bands == 1``.
-    """
-    if time_thres < hist_bin_size:
-        raise ValueError(
-            "`time_thres` must be greater than `hist_bin_size`. "
-            f"Found: hist_bin_size={hist_bin_size}, time_thres={time_thres}."
-        )
-
-    if room.dtype != source.dtype or source.dtype != mic_array.dtype:
-        raise ValueError(
-            "dtype of `room`, `source` and `mic_array` must match. "
-            f"Found: `room` ({room.dtype}), `source` ({source.dtype}) and "
-            f"`mic_array` ({mic_array.dtype})"
-        )
-
-    _validate_inputs(room, source, mic_array)
-    absorption = _adjust_coeff(absorption, "absorption").to(room.dtype)
-    scattering = _adjust_coeff(scattering, "scattering").to(room.dtype)
-
-    # Bring absorption and scattering to the same shape
-    if absorption.shape[0] == 1 and scattering.shape[0] > 1:
-        absorption = absorption.expand(scattering.shape)
-    if scattering.shape[0] == 1 and absorption.shape[0] > 1:
-        scattering = scattering.expand(absorption.shape)
-    if absorption.shape != scattering.shape:
-        raise ValueError(
-            "`absorption` and `scattering` must be broadcastable to the same number of bands and walls. "
-            f"Inferred shapes absorption={absorption.shape} and scattering={scattering.shape}"
-        )
-
-    histograms = torch.ops.torchaudio.ray_tracing(
-        room,
-        source,
-        mic_array,
-        num_rays,
-        absorption,
-        scattering,
-        mic_radius,
-        sound_speed,
-        energy_thres,
-        time_thres,
-        hist_bin_size,
-    )
-
-    return histograms
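
For reference, a minimal call into this removed prototype API as it worked under 2.7.1. The values are illustrative, and the import path assumes simulate_rir_ism was re-exported by torchaudio.prototype.functional, whose deleted __init__.py appears in the file list above.

# Runs under torchaudio 2.7.1 only; this module is deleted in 2.9.0.
import torch
from torchaudio.prototype.functional import simulate_rir_ism

room = torch.tensor([8.0, 6.0, 3.0])         # room size in meters (W, L, H)
source = torch.tensor([2.0, 1.5, 1.7])       # sound source position
mic_array = torch.tensor([[5.0, 3.0, 1.2]])  # one microphone, shape (1, 3)

rir = simulate_rir_ism(room, source, mic_array, max_order=3, absorption=0.2)
print(rir.shape)  # (1, rir_length)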
torchaudio/prototype/functional/functional.py
@@ -1,190 +0,0 @@
-import math
-import warnings
-from typing import Optional
-
-import torch
-from torchaudio.functional.functional import _create_triangular_filterbank
-
-
-def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
-    r"""Convert Hz to Barks.
-
-    Args:
-        freqs (float): Frequencies in Hz
-        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Returns:
-        barks (float): Frequency in Barks
-    """
-
-    if bark_scale not in ["schroeder", "traunmuller", "wang"]:
-        raise ValueError('bark_scale should be one of "schroeder", "traunmuller" or "wang".')
-
-    if bark_scale == "wang":
-        return 6.0 * math.asinh(freqs / 600.0)
-    elif bark_scale == "schroeder":
-        return 7.0 * math.asinh(freqs / 650.0)
-    # Traunmuller Bark scale
-    barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
-    # Bark value correction
-    if barks < 2:
-        barks += 0.15 * (2 - barks)
-    elif barks > 20.1:
-        barks += 0.22 * (barks - 20.1)
-
-    return barks
-
-
-def _bark_to_hz(barks: torch.Tensor, bark_scale: str = "traunmuller") -> torch.Tensor:
-    """Convert bark bin numbers to frequencies.
-
-    Args:
-        barks (torch.Tensor): Bark frequencies
-        bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Returns:
-        freqs (torch.Tensor): Barks converted in Hz
-    """
-
-    if bark_scale not in ["schroeder", "traunmuller", "wang"]:
-        raise ValueError('bark_scale should be one of "traunmuller", "schroeder" or "wang".')
-
-    if bark_scale == "wang":
-        return 600.0 * torch.sinh(barks / 6.0)
-    elif bark_scale == "schroeder":
-        return 650.0 * torch.sinh(barks / 7.0)
-    # Bark value correction
-    if any(barks < 2):
-        idx = barks < 2
-        barks[idx] = (barks[idx] - 0.3) / 0.85
-    elif any(barks > 20.1):
-        idx = barks > 20.1
-        barks[idx] = (barks[idx] + 4.422) / 1.22
-
-    # Traunmuller Bark scale
-    freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
-
-    return freqs
-
-
-def _hz_to_octs(freqs, tuning=0.0, bins_per_octave=12):
-    a440 = 440.0 * 2.0 ** (tuning / bins_per_octave)
-    return torch.log2(freqs / (a440 / 16))
-
-
-def barkscale_fbanks(
-    n_freqs: int,
-    f_min: float,
-    f_max: float,
-    n_barks: int,
-    sample_rate: int,
-    bark_scale: str = "traunmuller",
-) -> torch.Tensor:
-    r"""Create a frequency bin conversion matrix.
-
-    .. devices:: CPU
-
-    .. properties:: TorchScript
-
-    .. image:: https://download.pytorch.org/torchaudio/doc-assets/bark_fbanks.png
-        :alt: Visualization of generated filter bank
-
-    Args:
-        n_freqs (int): Number of frequencies to highlight/apply
-        f_min (float): Minimum frequency (Hz)
-        f_max (float): Maximum frequency (Hz)
-        n_barks (int): Number of mel filterbanks
-        sample_rate (int): Sample rate of the audio waveform
-        bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
-
-    Returns:
-        torch.Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``)
-        meaning number of frequencies to highlight/apply to x the number of filterbanks.
-        Each column is a filterbank so that assuming there is a matrix A of
-        size (..., ``n_freqs``), the applied result would be
-        ``A * barkscale_fbanks(A.size(-1), ...)``.
-
-    """
-
-    # freq bins
-    all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
-
-    # calculate bark freq bins
-    m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
-    m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
-
-    m_pts = torch.linspace(m_min, m_max, n_barks + 2)
-    f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
-
-    # create filterbank
-    fb = _create_triangular_filterbank(all_freqs, f_pts)
-
-    if (fb.max(dim=0).values == 0.0).any():
-        warnings.warn(
-            "At least one bark filterbank has all zero values. "
-            f"The value for `n_barks` ({n_barks}) may be set too high. "
-            f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
-        )
-
-    return fb
-
-
-def chroma_filterbank(
-    sample_rate: int,
-    n_freqs: int,
-    n_chroma: int,
-    *,
-    tuning: float = 0.0,
-    ctroct: float = 5.0,
-    octwidth: Optional[float] = 2.0,
-    norm: int = 2,
-    base_c: bool = True,
-):
-    """Create a frequency-to-chroma conversion matrix. Implementation adapted from librosa.
-
-    Args:
-        sample_rate (int): Sample rate.
-        n_freqs (int): Number of input frequencies.
-        n_chroma (int): Number of output chroma.
-        tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
-        ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
-        octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
-            If ``None``, then disable weighting altogether. (Default: 2.0)
-        norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
-        base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
-
-    Returns:
-        torch.Tensor: Chroma filter bank, with shape `(n_freqs, n_chroma)`.
-    """
-    # Skip redundant upper half of frequency range.
-    freqs = torch.linspace(0, sample_rate // 2, n_freqs)[1:]
-    freq_bins = n_chroma * _hz_to_octs(freqs, bins_per_octave=n_chroma, tuning=tuning)
-    freq_bins = torch.cat((torch.tensor([freq_bins[0] - 1.5 * n_chroma]), freq_bins))
-    freq_bin_widths = torch.cat(
-        (
-            torch.maximum(freq_bins[1:] - freq_bins[:-1], torch.tensor(1.0)),
-            torch.tensor([1]),
-        )
-    )
-
-    # (n_freqs, n_chroma)
-    D = freq_bins.unsqueeze(1) - torch.arange(0, n_chroma)
-
-    n_chroma2 = round(n_chroma / 2)
-
-    # Project to range [-n_chroma/2, n_chroma/2 - 1]
-    D = torch.remainder(D + n_chroma2, n_chroma) - n_chroma2
-
-    fb = torch.exp(-0.5 * (2 * D / torch.tile(freq_bin_widths.unsqueeze(1), (1, n_chroma))) ** 2)
-    fb = torch.nn.functional.normalize(fb, p=norm, dim=1)
-
-    if octwidth is not None:
-        fb *= torch.tile(
-            torch.exp(-0.5 * (((freq_bins.unsqueeze(1) / n_chroma - ctroct) / octwidth) ** 2)),
-            (1, n_chroma),
-        )
-
-    if base_c:
-        fb = torch.roll(fb, -3 * (n_chroma // 12), dims=1)
-
-    return fb
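
The Bark filter bank above is applied the same way as torchaudio's mel filter banks: multiply a linear-frequency spectrogram by the (n_freqs, n_barks) matrix. A sketch as it worked under 2.7.1 (the import path assumes barkscale_fbanks was re-exported by the deleted torchaudio.prototype.functional __init__.py):

# Runs under torchaudio 2.7.1 only; the prototype package is removed in 2.9.0.
import torch
from torchaudio.prototype.functional import barkscale_fbanks

n_fft, sample_rate = 1024, 16000
fb = barkscale_fbanks(
    n_freqs=n_fft // 2 + 1,
    f_min=0.0,
    f_max=sample_rate / 2.0,
    n_barks=32,
    sample_rate=sample_rate,
)  # (513, 32)

spec = torch.rand(1, n_fft // 2 + 1, 100)  # (channel, freq, time) power spectrogram
bark_spec = (spec.transpose(-1, -2) @ fb).transpose(-1, -2)  # (1, 32, 100)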
torchaudio/prototype/models/__init__.py
@@ -1,36 +0,0 @@
-from ._conformer_wav2vec2 import (
-    conformer_wav2vec2_base,
-    conformer_wav2vec2_model,
-    conformer_wav2vec2_pretrain_base,
-    conformer_wav2vec2_pretrain_large,
-    conformer_wav2vec2_pretrain_model,
-    ConformerWav2Vec2PretrainModel,
-)
-from ._emformer_hubert import emformer_hubert_base, emformer_hubert_model
-from .conv_emformer import ConvEmformer
-from .hifi_gan import hifigan_vocoder, hifigan_vocoder_v1, hifigan_vocoder_v2, hifigan_vocoder_v3, HiFiGANVocoder
-from .rnnt import conformer_rnnt_base, conformer_rnnt_biasing, conformer_rnnt_biasing_base, conformer_rnnt_model
-from .rnnt_decoder import Hypothesis, RNNTBeamSearchBiasing
-
-__all__ = [
-    "conformer_rnnt_base",
-    "conformer_rnnt_model",
-    "conformer_rnnt_biasing",
-    "conformer_rnnt_biasing_base",
-    "ConvEmformer",
-    "conformer_wav2vec2_model",
-    "conformer_wav2vec2_base",
-    "conformer_wav2vec2_pretrain_model",
-    "conformer_wav2vec2_pretrain_base",
-    "conformer_wav2vec2_pretrain_large",
-    "ConformerWav2Vec2PretrainModel",
-    "emformer_hubert_base",
-    "emformer_hubert_model",
-    "Hypothesis",
-    "RNNTBeamSearchBiasing",
-    "HiFiGANVocoder",
-    "hifigan_vocoder_v1",
-    "hifigan_vocoder_v2",
-    "hifigan_vocoder_v3",
-    "hifigan_vocoder",
-]
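
The export list above is exactly what disappears from torchaudio.prototype.models in 2.9.0; code importing these names must pin an older torchaudio (2.7.x) or vendor the model definitions. For example, under 2.7.1 (assuming conformer_rnnt_base builds with its default configuration and no required arguments):

# Runs under torchaudio 2.7.1 only; these symbols have no 2.9.0
# replacement within torchaudio itself.
from torchaudio.prototype.models import conformer_rnnt_base

model = conformer_rnnt_base()  # Conformer RNN-T, library default config
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")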